Exemple #1
0
void swanLoadProgramFromSource(  const char *module, const unsigned  char *ptx, size_t len , int devtype ) {
	int i=0;
	CUresult err;
	try_init();
	// let's see whether this module is already loaded
	for( i=0; i < state.num_mods; i++ ) {
		if( !strcmp( state.mod_names[i], module ) ) {
			return; // already loaded
		}
	} 	

	if( ptx == NULL || len == 0 ) {
		fprintf ( stderr, "SWAN : Module load failure [%s]. No source \n", module );
		error( "Module source invalid" );
	}

	i = state.num_mods;
	state.num_mods++;
	state.mods         = (CUmodule*) realloc( state.mods, state.num_mods * sizeof(CUmodule) );
	state.mod_names    = (char**) realloc( state.mod_names, state.num_mods * sizeof(char*) );
	state.mod_names[i] = (char*) malloc( strlen( module ) + 1 );
	strcpy( state.mod_names[i], module );

	// now load the PTX into a module
	err = cuModuleLoadData( &state.mods[i], ptx );
	if( err != CUDA_SUCCESS ) {
		fprintf ( stderr, "SWAN : Module load result [%s] [%d]\n", module, err );
		error( "Module load failed\n" );
	}
	
}
Exemple #2
0
int main(){
	init_test();
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel(.param .u64 kernel_param_0) {\n"
	".reg .s32 	%r<2>;\n"
	".reg .s64 	%rd<3>;\n"
	"bra 	BB1_2;\n"
	"ld.param.u64 	%rd1, [kernel_param_0];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"mov.u32 	%r1, 5;\n"
	"st.global.u32 	[%rd2], %r1;\n"
	"BB1_2: ret;\n"
	"}\n";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel"));
	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue)));
	void * params[] = {&devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue)));
	assert(hostValue == 10);
	std::cout << hostValue << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
/*
 * Run the PTX file ptxPath through the CUDA JIT linker, load the linked image
 * as a module and extract the kernel kName from it.
 *
 * The linker is created with options requesting the wall time, an info log,
 * an error log and verbose logging; the info log and wall time are printed to
 * stdout after a successful link, the error log to stderr when adding the PTX
 * file fails.
 *
 * @param phModule    Output module handle
 * @param phKernel    Output kernel handle
 * @param ptxPath     ptx file name
 * @param kName       kernel name
 */
void ptxJIT(CUmodule *phModule, CUfunction *phKernel, const char *ptxPath, const char *kName)
{
  CUlinkState cuLinkState;
  CUjit_option options[6];
  void *optionVals[6];
  // Zero/empty-initialise everything the driver is expected to fill in, so the
  // printf calls below never read indeterminate memory if it writes nothing.
  float walltime = 0.0f;
  char error_log[8192] = "", info_log[8192] = "";
  unsigned int logSize = 8192;
  void *cuOut;
  size_t outSize;
  CUresult myErr;

  // Setup linker options
  // Return walltime from JIT compilation
  options[0] = CU_JIT_WALL_TIME;
  optionVals[0] = (void *) &walltime;
  // Pass a buffer for info messages
  options[1] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[1] = (void *) info_log;
  // Pass the size of the info buffer (the value itself travels in the pointer;
  // size_t is used for the widening cast because `long` is not pointer-sized
  // on every platform)
  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[2] = (void *) (size_t) logSize;
  // Pass a buffer for error message
  options[3] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[3] = (void *) error_log;
  // Pass the size of the error buffer
  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[4] = (void *) (size_t) logSize;
  // Make the linker verbose
  options[5] = CU_JIT_LOG_VERBOSE;
  optionVals[5] = (void *) 1;

  // Create a pending linker invocation
  checkCudaErrors(cuLinkCreate(6,options, optionVals, &cuLinkState));

  // Load the ptx from the file
  myErr = cuLinkAddFile(cuLinkState, CU_JIT_INPUT_PTX, ptxPath, 0, 0, 0);
  if (myErr != CUDA_SUCCESS){
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
    fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
    // Do not go on to complete a link whose input was rejected: report the
    // failure through the same checker as every other driver call here.
    checkCudaErrors(myErr);
  }

  // Complete the linker step
  checkCudaErrors(cuLinkComplete(cuLinkState, &cuOut, &outSize));

  // Linker walltime and info_log were requested in options above.
  printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime, info_log);

  // Load resulting cuBin into module
  checkCudaErrors(cuModuleLoadData(phModule, cuOut));

  // Locate the kernel entry point
  checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, kName));

  // Destroy the linker invocation (done last: cuOut is only used above)
  checkCudaErrors(cuLinkDestroy(cuLinkState));
}
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
Exemple #5
0
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 *
 * NOTE(review): the generated "Signature" line above describes a method with
 * no arguments, but this implementation takes five JNI arguments - the header
 * comment looks stale; confirm against the Java declaration.
 *
 * Builds the cubin image from `buffers`, loads it into the global `cuModule`,
 * looks up the entry kernel into the global `cuFunction`, selects the
 * prefer-L1 cache configuration and lays out the kernel parameter block:
 * six CUdeviceptr-sized values followed by one int (`num_blocks`).
 * `heap_end_ptr` is stored in the global `heapEndPtr`.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jobject buffers, jint size, 
   jint total_size, jint num_blocks){

  void * cubin_file;
  int offset;
  CUresult status;
  heapEndPtr = heap_end_ptr;
  
  //void * cubin_file = readCubinFile("code_file.cubin");
  cubin_file = readCubinFileFromBuffers(env, buffers, size, total_size);
  status = cuModuleLoadData(&cuModule, cubin_file);
  // The image is no longer needed once cuModuleLoadData has returned. Release
  // it before the status check so it is not leaked if CHECK_STATUS leaves the
  // function on failure (presumably it does - its definition is not visible
  // here).
  free(cubin_file);
  CHECK_STATUS(env,"error in cuModuleLoad",status)

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_i"); 
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  // Total parameter block: six device-pointer slots plus the block count.
  status = cuParamSetSize(cuFunction, (6 * sizeof(CUdeviceptr) + sizeof(int))); 
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  // Each parameter is written at the running byte offset, in kernel
  // argument order.
  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr); 

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  // Last slot: the integer block count.
  status = cuParamSeti(cuFunction, offset, num_blocks); 
  CHECK_STATUS(env,"error in cuParamSeti num_blocks",status)
}
Exemple #6
0
/*
 * R binding for cuModuleLoadData().
 *
 * Extracts the image pointer from `r_image`, loads it as a CUDA module and
 * returns a "CUmodule" reference object; on a non-zero driver status the
 * result of R_cudaErrorInfo() for that status is returned instead.
 */
SEXP
R_auto_cuModuleLoadData(SEXP r_image)
{
    CUmodule loaded;
    const void * image = GET_REF(r_image, const void );
    CUresult status = cuModuleLoadData(&loaded, image);

    /* CUDA_SUCCESS is 0, so any non-zero status is a failure. */
    if (status != CUDA_SUCCESS)
        return R_cudaErrorInfo(status);

    return R_createRef(loaded, "CUmodule");
}
Exemple #7
0
  // Load a kernel from an in-memory module image.
  //
  // `cache` is passed directly to cuModuleLoadData() as the module image and
  // `functionName_` is the entry point looked up in the resulting module.
  // The name is stored in the `functionName` member; the module and function
  // handles are stored in `data_`. Returns `this`.
  //
  // NOTE(review): `data_` is not declared in this function - presumably the
  // OCCA_EXTRACT_DATA macro introduces it; confirm against the macro.
  kernel_t<CUDA>* kernel_t<CUDA>::loadFromLibrary(const char *cache,
                                                  const std::string &functionName_){
    OCCA_EXTRACT_DATA(CUDA, Kernel);

    functionName = functionName_;

    // Both driver calls are wrapped in OCCA_CUDA_CHECK with a message that
    // names the kernel and the step being performed.
    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                    cuModuleLoadData(&data_.module, cache));

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                    cuModuleGetFunction(&data_.function, data_.module, functionName.c_str()));

    return this;
  }
Exemple #8
0
// Create (once) the per-pipeline module_state referenced by *state_ptr and
// load the PTX in ptx_src into it as a CUDA module if it has none yet.
//
// Returns 0 on success, the error recorded by CudaContext if acquiring the
// context failed, or the CUresult of cuModuleLoadData if the load failed.
// `size` is only used for debug output.
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* ptx_src, int size) {
    DEBUG_PRINTF( user_context, "CUDA: halide_init_kernels (user_context: %p, state_ptr: %p, ptx_src: %p, %i)\n",
                  user_context, state_ptr, ptx_src, size );

    CudaContext ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_init_kernels/halide_release is called.
    // halide_release traverses this list and releases the module objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state**)state_ptr;
    if (!(*state)) {
        *state = (module_state*)malloc(sizeof(module_state));
        (*state)->module = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the module itself if necessary.
    if (!(*state)->module) {
        DEBUG_PRINTF( user_context, "    cuModuleLoadData %p, %i -> ", ptx_src, size );
        CUresult err = cuModuleLoadData(&(*state)->module, ptx_src);
        if (err != CUDA_SUCCESS) {
            DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
            halide_error_varargs(user_context, "CUDA: cuModuleLoadData failed (%s)",
                                 _get_error_name(err));
            return err;
        } else {
            // Report the handle that was actually loaded (stored in the state
            // node), not a separate never-assigned local.
            DEBUG_PRINTF( user_context, "%p\n", (*state)->module );
        }
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Exemple #9
0
// Loads a PTX kernel that, for every thread whose %tid.x is below the int
// parameter, stores tid * 1.5f (0f3FC00000) into the float array parameter.
// The kernel is launched with twice as many threads as array elements, so the
// upper half of the block must be filtered out by the kernel's own bounds
// check. The resulting array is printed one element per line.
int main(){
	init_test();
	const std::string test_source =
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry _Z6kernelPfi(\n"
	".param .u64 _Z6kernelPfi_param_0,\n"
	".param .u32 _Z6kernelPfi_param_1){\n"
	".reg .pred 	%p<2>;\n"
	".reg .f32 	%f<3>;\n"
	".reg .s32 	%r<3>;\n"
	".reg .s64 	%rd<5>;\n"
	"ld.param.u64 	%rd1, [_Z6kernelPfi_param_0];\n"
	"ld.param.u32 	%r2, [_Z6kernelPfi_param_1];\n"
	"mov.u32 	%r1, %tid.x;\n"
	"setp.ge.u32	%p1, %r1, %r2;\n"
	"@%p1 bra 	BB0_2;\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"cvt.rn.f32.u32	%f1, %r1;\n"
	"mul.f32 	%f2, %f1, 0f3FC00000;\n"
	"mul.wide.u32 	%rd3, %r1, 4;\n"
	"add.s64 	%rd4, %rd2, %rd3;\n"
	"st.global.f32 	[%rd4], %f2;\n"
	"BB0_2:\n"
	"ret;\n"
	"}";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "_Z6kernelPfi"));
	CUdeviceptr devArray;
	// The array length must be a constant expression: an array sized by a
	// plain `int` is a variable-length array, which standard C++ does not have.
	constexpr int kArrayLen = 10;
	// Mutable copy whose address is handed to the launch as the int argument.
	int size = kArrayLen;
	float hostArray[kArrayLen];
	memset(hostArray, 0, size * sizeof(hostArray[0]));
	cu_assert(cuMemAlloc(&devArray, sizeof(float) * size));
	void * params[] = {&devArray, &size};
	// One block of size*2 threads: deliberately more threads than elements.
	auto result = cuLaunchKernel(funcHandle, 1,1,1, size*2,1,1, 0,0, params, nullptr);
	cu_assert(result);
	cu_assert(cuMemcpyDtoH(&hostArray, devArray, sizeof(hostArray[0])*size));
	cu_assert(cuMemFree(devArray));
	cu_assert(cuModuleUnload(modId));
	for (int i=0 ; i<size ; ++i)
		std::cout << hostArray[i] << '\n';
	return 0;
}
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount,
                             long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cudaDevice));
    } else {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice));
    }


    if (kernelMap == NULL) {
        // kernels have not yet been initialized; do so now.  Hopefully, this only occurs once per library load.
        InitializeKernelMap();
    }

    int id = paddedStateCount;
    if (flags & BEAGLE_FLAG_PRECISION_DOUBLE) {
        id *= -1;
    }

    if (kernelMap->count(id) == 0) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }

    kernelResource = (*kernelMap)[id].copy();
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::SetDevice\n");
#endif

}
Exemple #11
0
int main(){
	init_test();
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel_4(\n"
	".param .u32 kernel_4_param_0,\n"
	".param .u64 kernel_4_param_1\n"
	")\n"
	"{\n"
	".reg .s32 	%r<3>;\n"
	".reg .s64 	%rd<3>;\n"
	"ld.param.u32 	%r1, [kernel_4_param_0];\n"
	"ld.param.u64 	%rd1, [kernel_4_param_1];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"add.s32 	%r2, %r1, 7;\n"
	"st.global.u32 	[%rd2], %r2;\n"
	"ret;\n"
	"}";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel_4"));
	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	void * params[] = {&hostValue, &devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	int result = 0;
	cu_assert(cuMemcpyDtoH(&result, devValue, sizeof(result)));
	assert(result == hostValue + 7);
	std::cout << result << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
Exemple #12
0
// One-time CUDA setup for a Halide module.
//
// Creates the shared context if *cuda_ctx_ptr is still 0 (initialising the
// driver and choosing a device first), loads ptx_src into __mod if no module
// has been loaded yet, and creates the __start/__end timing events if
// __start has not been created yet.
//
// Device choice: the ordinal in the HL_GPU_DEVICE environment variable when it
// is set, otherwise the first of ordinals 2, 1, 0 that cuDeviceGet accepts.
WEAK void halide_init_kernels(const char* ptx_src) {
    // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context.
    if (cuda_ctx_ptr == NULL) {
        cuda_ctx_ptr = &weak_cuda_ctx;
    }

    // Initialize one shared context for all Halide compiled instances
    if (*cuda_ctx_ptr == 0) {
        // Initialize CUDA
        CHECK_CALL( cuInit(0), "cuInit" );

        // Make sure we have a device
        int deviceCount = 0;
        CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" );
        assert(deviceCount > 0);
        
        char *device_str = getenv("HL_GPU_DEVICE");

        CUdevice dev;
        // Get device
        CUresult status;
        if (device_str) {
            status = cuDeviceGet(&dev, atoi(device_str));
        } else {
            for (int id = 2; id >= 0; id--) {
                // Try to get a device >0 first, since 0 should be our display device
                status = cuDeviceGet(&dev, id);
                if (status == CUDA_SUCCESS) break;
            }
        }

        if (status != CUDA_SUCCESS) {
            fprintf(stderr, "Failed to get device\n");
            exit(-1);
        }

        #ifndef NDEBUG
        fprintf(stderr, "Got device %d, about to create context (t=%d)\n", dev, halide_current_time());
        #endif


        // Create context
        CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" );

    } else {
        //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" );
    }
    
    // Initialize a module for just this Halide module
    if (!__mod) {
        // Create module
        CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" );

        #ifndef NDEBUG
        fprintf(stderr, "-------\nCompiling PTX:\n%s\n--------\n", ptx_src);
        #endif
    }

    // Create two events for timing. The results are checked like every other
    // driver call in this function so that a failed creation is reported here
    // rather than surfacing later when the events are used.
    if (!__start) {
        CHECK_CALL( cuEventCreate(&__start, 0), "cuEventCreate" );
        CHECK_CALL( cuEventCreate(&__end, 0), "cuEventCreate" );
    }
}
Exemple #13
0
/**
 * Configure the output link of the CUDA scale filter.
 *
 * With the device's CUDA context pushed, loads the embedded PTX
 * (vf_scale_cuda_ptx) as a module and resolves the six Subsample_Bilinear_*
 * kernels and the six texture references, setting each texture to
 * read-as-integer with linear filtering. It then evaluates the output
 * dimensions, initialises the processing chain and derives the output sample
 * aspect ratio.
 *
 * Returns 0 on success or a negative error code. Any CUDA failure after the
 * context has been pushed pops the context again before returning.
 */
static av_cold int cudascale_config_props(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    AVFilterLink *inlink = outlink->src->inputs[0];
    CUDAScaleContext *s  = ctx->priv;
    AVHWFramesContext     *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    int w, h;
    int ret;

    extern char vf_scale_cuda_ptx[];

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0)
        goto fail;

    /* From here on the context is pushed: every failure must pop it. */
    ret = CHECK_CU(cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx));
    if (ret < 0)
        goto fail_pop;

    /* Each chain stops at the first failing call and leaves its code in ret. */
    if ((ret = CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"))) < 0)
        goto fail_pop;

    if ((ret = CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"))) < 0 ||
        (ret = CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex"))) < 0)
        goto fail_pop;

    if ((ret = CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER))) < 0)
        goto fail_pop;

    if ((ret = CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR))) < 0 ||
        (ret = CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR))) < 0)
        goto fail_pop;

    CHECK_CU(cuCtxPopCurrent(&dummy));

    if ((ret = ff_scale_eval_dimensions(s,
                                        s->w_expr, s->h_expr,
                                        inlink, outlink,
                                        &w, &h)) < 0)
        goto fail;

    /* NOTE(review): this only logs; the oversized dimensions are still used. */
    if (((int64_t)h * inlink->w) > INT_MAX  ||
        ((int64_t)w * inlink->h) > INT_MAX)
        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");

    outlink->w = w;
    outlink->h = h;

    ret = init_processing_chain(ctx, inlink->w, inlink->h, w, h);
    if (ret < 0)
        return ret;

    av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d -> w:%d h:%d\n",
           inlink->w, inlink->h, outlink->w, outlink->h);

    if (inlink->sample_aspect_ratio.num) {
        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
                                                             outlink->w*inlink->h},
                                                inlink->sample_aspect_ratio);
    } else {
        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
    }

    return 0;

fail_pop:
    /* Undo the cuCtxPushCurrent above; ret keeps the original error. */
    CHECK_CU(cuCtxPopCurrent(&dummy));
fail:
    return ret;
}
//------------------------------------------------------------------------------
// JIT-link the PTX files in `files`, load the linked image into `module` and
// look up `kernel_name` into `kernel`.
//
// The linker is created with options requesting the wall time, an info log,
// an error log and verbose logging. Linking stops at the first file that
// fails to be added; in that case (and when `files` is empty) the error log
// is printed to std::cerr and the process exits with EXIT_FAILURE. On success
// the wall time and info log are printed to std::cout.
void build(CUmodule& module,
           CUfunction& kernel,
           const std::vector< std::string >& files,
           const char* kernel_name) {


    CUjit_option options[] = {CU_JIT_WALL_TIME,
                              CU_JIT_INFO_LOG_BUFFER,
                              CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_ERROR_LOG_BUFFER,
                              CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_LOG_VERBOSE};
    float walltime = 0.f;
    const unsigned bufsize = 0x10000;
    char error_buf[bufsize] = "";
    char log_buf[bufsize] = "";
    const int verbose = 1;
    // Integer-valued options travel in the pointer itself; widen them to a
    // pointer-sized integer before the cast.
    void* option_values[] = {(void*) &walltime,
                             (void*) log_buf, 
                             (void*) (size_t) bufsize,
                             (void*) error_buf,
                             (void*) (size_t) bufsize,
                             (void*) (size_t) verbose};

    void* compiled_code = 0;
    size_t compiled_size = 0;
    // Sentinel that differs from CUDA_SUCCESS, so an empty file list is
    // reported as a failure below.
    int status = CUDA_SUCCESS - 1;
      
    CUlinkState link_state = CUlinkState();
    
    const int num_options = sizeof(options) / sizeof(CUjit_option);

    // Create a pending linker invocation
    CCHECK(cuLinkCreate(num_options,
                        options, option_values, &link_state));

    for(std::vector< std::string >::const_iterator i = files.begin();
        i != files.end();
        ++i) {
        status = cuLinkAddFile(link_state,
                             CU_JIT_INPUT_PTX, 
                             i->c_str(),
                             0, //num options
                             0, //options,
                             0); //option values
        // Stop at the first failure: otherwise a later file that links
        // cleanly would overwrite the error and hide it.
        if( status != CUDA_SUCCESS ) break;
    }


    if( status != CUDA_SUCCESS ) {
        std::cerr << "PTX Linker Error:\n"<< error_buf << std::endl;
        exit(EXIT_FAILURE);
    }

    // Complete the linker step: compiled_code is filled with executable code
    //???: what do I do with the returned data ? can/should I delete it ?
    CCHECK(cuLinkComplete(link_state, &compiled_code, &compiled_size));
    assert(compiled_size > 0);
    assert(compiled_code);

    std::cout << "CUDA Link Completed in " << walltime << " ms\n"
              << log_buf << std::endl; 

    CCHECK(cuModuleLoadData(&module, compiled_code));

    CCHECK(cuModuleGetFunction(&kernel, module, kernel_name));

    // Destroyed only after the module has been loaded from compiled_code.
    CCHECK(cuLinkDestroy(link_state));
}
Exemple #15
0
/* Link the NUM_OBJS PTX objects in PTX_OBJS with the CUDA JIT linker and load
   the result into *MODULE.

   The linker is created with options requesting the wall time, an info log,
   an error log, verbose logging and a compute_30 target.  Any driver failure
   is fatal (GOMP_PLUGIN_fatal); a failure while adding an object also reports
   the linker's error log first.  */

static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  /* Start with empty logs so that printing them below is well defined even
     if the driver writes nothing into them.  */
  elog[0] = '\0';
  ilog[0] = '\0';

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
			 ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  /* The timing and log are emitted before the status check so that they are
     available when the link failed as well.  */
  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  /* The link state is destroyed only after the module has been loaded from
     LINKOUT.  */
  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}
Exemple #16
0
/*
 * build_kernel_source
 *
 * Compiles the CUDA source loaded from 'source_file' with NVRTC for the
 * compute capability 'target_capability', optionally links it through
 * link_device_libraries() when load_kernel_source() reports that the device
 * runtime is required, and loads the resulting image as a CUDA module.
 *
 * The build log is printed to stdout when it is non-empty. A compilation
 * failure terminates the process with exit(1) after the log is printed; other
 * NVRTC / CUDA failures are reported through nvrtc_error() / cuda_error().
 */
static CUmodule
build_kernel_source(const char *source_file, long target_capability)
{
	char		   *source;
	int				link_dev_runtime;
	nvrtcProgram	program;
	nvrtcResult		rc;
	char			arch_buf[128];
	const char	   *options[10];
	int				opt_index = 0;
	int				build_failure = 0;
	char		   *build_log;
	size_t			build_log_len;
	char		   *ptx_image;
	size_t			ptx_image_len;
	void		   *bin_image;
	size_t			bin_image_len;
	CUmodule		cuda_module;
	CUresult		cuda_rc;

	source = load_kernel_source(source_file, &link_dev_runtime);
	rc = nvrtcCreateProgram(&program,
							source,
							NULL,
							0,
							NULL,
							NULL);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcCreateProgram");

	/*
	 * Put command line options as cuda_program.c doing
	 */
	options[opt_index++] = "-I " CUDA_INCLUDE_PATH;
	snprintf(arch_buf, sizeof(arch_buf),
			 "--gpu-architecture=compute_%ld", target_capability);
	options[opt_index++] = arch_buf;
#ifdef PGSTROM_DEBUG
	options[opt_index++] = "--device-debug";
	options[opt_index++] = "--generate-line-info";
#endif
	options[opt_index++] = "--use_fast_math";
	if (link_dev_runtime)
		options[opt_index++] = "--relocatable-device-code=true";

	/*
	 * Kick runtime compiler
	 *
	 * A compilation error is only remembered here, so that the build log
	 * can still be fetched and printed before giving up.
	 */
	rc = nvrtcCompileProgram(program, opt_index, options);
	if (rc != NVRTC_SUCCESS)
	{
		if (rc == NVRTC_ERROR_COMPILATION)
			build_failure = 1;
		else
			nvrtc_error(rc, "nvrtcCompileProgram");
	}

	/*
	 * Print build log
	 */
	rc = nvrtcGetProgramLogSize(program, &build_log_len);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetProgramLogSize");
	build_log = malloc(build_log_len + 1);
	if (!build_log)
	{
		fputs("out of memory", stderr);
		exit(1);
	}
	rc = nvrtcGetProgramLog(program, build_log);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetProgramLog");

	if (build_log_len > 1)
		printf("build log:\n%s\n", build_log);
	/* the log is not referenced after this point */
	free(build_log);
	if (build_failure)
		exit(1);

	/*
	 * Get PTX Image
	 */
	rc = nvrtcGetPTXSize(program, &ptx_image_len);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetPTXSize");
	ptx_image = malloc(ptx_image_len + 1);
	if (!ptx_image)
	{
		fputs("out of memory", stderr);
		exit(1);
	}
	rc = nvrtcGetPTX(program, ptx_image);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetPTX");
	ptx_image[ptx_image_len] = '\0';

	/*
	 * The PTX has been copied out, so the program handle - which never
	 * leaves this function - can be released.
	 */
	rc = nvrtcDestroyProgram(&program);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcDestroyProgram");

	/*
	 * Link device runtime if needed
	 */
	if (link_dev_runtime)
	{
		link_device_libraries(ptx_image, ptx_image_len,
							  &bin_image, &bin_image_len,
							  target_capability);
	}
	else
	{
		bin_image = ptx_image;
		bin_image_len = ptx_image_len;
	}

	/*
	 * Report the driver's own status code on failure (cuda_rc), not the
	 * last NVRTC result.
	 */
	cuda_rc = cuModuleLoadData(&cuda_module, bin_image);
	if (cuda_rc != CUDA_SUCCESS)
		cuda_error(cuda_rc, "cuModuleLoadData");
	return cuda_module;
}
Exemple #17
0
// One-time CUDA setup for a Halide module.
//
// Creates the shared context if *cuda_ctx_ptr is still 0 (initialising the
// driver and choosing a device first), loads ptx_src into __mod if no module
// has been loaded yet, and creates the __start/__end timing events if
// __start has not been created yet. `size` is not used by this function.
//
// Device choice: the ordinal in the HL_GPU_DEVICE environment variable when it
// is set, otherwise the highest of the first two ordinals that cuDeviceGet
// accepts.
WEAK void halide_init_kernels(void *user_context, const char* ptx_src, int size) {
    // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context.
    if (cuda_ctx_ptr == NULL) {
        cuda_ctx_ptr = &weak_cuda_ctx;
    }

    // Initialize one shared context for all Halide compiled instances
    if (*cuda_ctx_ptr == 0) {
        // Initialize CUDA
        CHECK_CALL( cuInit(0), "cuInit" );

        // Make sure we have a device
        int deviceCount = 0;
        CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" );
        halide_assert(user_context, deviceCount > 0);

        char *device_str = getenv("HL_GPU_DEVICE");

        CUdevice dev;
        // Get device.
        // status starts out as a failure so that the check below still fires
        // if the probing loop never runs (deviceCount <= 0 and halide_assert
        // did not abort) instead of reading an indeterminate value.
        CUresult status = CUDA_ERROR_INVALID_DEVICE;
        if (device_str) {
            status = cuDeviceGet(&dev, atoi(device_str));
        } else {
            // Try to get a device >0 first, since 0 should be our display device
            // For now, don't try devices > 2 to maintain compatibility with previous behavior.
            if (deviceCount > 2)
                deviceCount = 2;
            for (int id = deviceCount - 1; id >= 0; id--) {
                status = cuDeviceGet(&dev, id);
                if (status == CUDA_SUCCESS) break;
            }
        }

        halide_assert(user_context, status == CUDA_SUCCESS && "Failed to get device\n");

        #ifdef DEBUG
        halide_printf(user_context, "Got device %d, about to create context (t=%lld)\n",
                      dev, (long long)halide_current_time_ns(user_context));
        #endif


        // Create context
        CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" );
    } else {
        //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" );
    }

    // Initialize a module for just this Halide module
    if (!__mod) {
        // Create module
        CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" );

        #ifdef DEBUG
        halide_printf(user_context, "-------\nCompiling PTX:\n%s\n--------\n",
                      ptx_src);
        #endif
    }

    // Create two events for timing. The results are checked like every other
    // driver call in this function so that a failed creation is reported here
    // rather than surfacing later when the events are used.
    if (!__start) {
        CHECK_CALL( cuEventCreate(&__start, 0), "cuEventCreate" );
        CHECK_CALL( cuEventCreate(&__end, 0), "cuEventCreate" );
    }
}
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount, int unpaddedPatternCount, int tipCount,
                             long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    unsigned int ctxFlags = CU_CTX_SCHED_AUTO;

    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        ctxFlags |= CU_CTX_MAP_HOST;
    }

    CUresult error = cuCtxCreate(&cudaContext, ctxFlags, cudaDevice);
    if(error != CUDA_SUCCESS) {
        fprintf(stderr, "CUDA error: \"%s\" (%d) from file <%s>, line %i.\n",
                GetCUDAErrorDescription(error), error, __FILE__, __LINE__);
        if (error == CUDA_ERROR_INVALID_DEVICE) {
            fprintf(stderr, "(The requested CUDA device is likely set to compute exclusive mode. This mode prevents multiple processes from running on the device.)");
        }
        exit(-1);
    }

    InitializeKernelResource(paddedStateCount, flags & BEAGLE_FLAG_PRECISION_DOUBLE);

    if (!kernelResource) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->unpaddedPatternCount = unpaddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    if ((paddedPatternCount < BEAGLE_MULTI_GRID_MAX || flags & BEAGLE_FLAG_PARALLELOPS_GRID) && !(flags & BEAGLE_FLAG_PARALLELOPS_STREAMS)) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = tipCount/2 + 1;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::SetDevice\n");
#endif

}
/*
 * Create a gpukernel for the function `fname` from CUDA source, PTX text or
 * a prebuilt binary, and load it into the CUDA context `c`.
 *
 * Parameters:
 *   c         context handle; used as a `cuda_context *`.
 *   count     number of entries in `strings`; 0 is rejected.
 *   strings   code fragments, concatenated in order.
 *   lengths   optional per-fragment byte lengths.  When NULL, or when an
 *             entry is 0, the fragment is appended as a NUL-terminated
 *             string.  With GA_USE_BINARY it must be non-NULL and
 *             lengths[0] must be non-zero.
 *   fname     name of the function looked up in the loaded module.
 *   argcount  number of entries in `types`.
 *   types     argument type codes; copied into the kernel object.
 *   flags     GA_USE_* bits.  GA_USE_OPENCL and GA_USE_COMPLEX are refused,
 *             GA_USE_DOUBLE requires compute capability >= 1.3,
 *             GA_USE_BINARY must be the only flag set, GA_USE_PTX skips the
 *             compiler and GA_USE_CLUDA prepends CUDA_PREAMBLE.
 *   ret       error-code output handed to FAIL() and call_compiler()
 *             (presumably may be NULL -- confirm against the FAIL macro).
 *   err_str   optional; on a compile failure receives a message holding the
 *             numbered source and the compiler log.  The caller free()s it.
 *
 * Returns the new kernel with refcnt 1 (the context refcnt is incremented),
 * or NULL on failure.
 */
static gpukernel *cuda_newkernel(void *c, unsigned int count,
                                 const char **strings, const size_t *lengths,
                                 const char *fname, unsigned int argcount,
                                 const int *types, int flags, int *ret,
                                 char **err_str) {
    cuda_context *ctx = (cuda_context *)c;
    strb sb = STRB_STATIC_INIT;
    char *bin, *log = NULL;
    srckey k, *ak;
    binval *av;
    gpukernel *res;
    size_t bin_len = 0, log_len = 0;
    CUdevice dev;
    unsigned int i;
    int ptx_mode = 0;
    int binary_mode = 0;
    int major, minor;

    if (count == 0) FAIL(NULL, GA_VALUE_ERROR);

    if (flags & GA_USE_OPENCL)
      FAIL(NULL, GA_DEVSUP_ERROR);

    if (flags & GA_USE_BINARY) {
      // GA_USE_BINARY is exclusive
      if (flags & ~GA_USE_BINARY)
        FAIL(NULL, GA_INVALID_ERROR);
      // We need the length for binary data and there is only one blob.
      if (count != 1 || lengths == NULL || lengths[0] == 0)
        FAIL(NULL, GA_VALUE_ERROR);
    }

    // Every exit path from here on must call cuda_exit(ctx).
    cuda_enter(ctx);

    ctx->err = cuCtxGetDevice(&dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    // GA_USE_CLUDA is done later
    // GA_USE_SMALL will always work
    if (flags & GA_USE_DOUBLE) {
      if (major < 1 || (major == 1 && minor < 3)) {
        cuda_exit(ctx);
        FAIL(NULL, GA_DEVSUP_ERROR);
      }
    }
    if (flags & GA_USE_COMPLEX) {
      // just for now since it is most likely broken
      cuda_exit(ctx);
      FAIL(NULL, GA_DEVSUP_ERROR);
    }
    // GA_USE_HALF should always work

    // GA_USE_PTX takes precedence; GA_USE_BINARY cannot be combined with it
    // anyway because of the exclusivity check above.
    if (flags & GA_USE_PTX) {
      ptx_mode = 1;
    } else if (flags & GA_USE_BINARY) {
      binary_mode = 1;
    }

    if (binary_mode) {
      // Keep a private copy of the caller's blob; the kernel object owns it.
      bin = memdup(strings[0], lengths[0]);
      bin_len = lengths[0];
      if (bin == NULL) {
        cuda_exit(ctx);
        FAIL(NULL, GA_MEMORY_ERROR);
      }
    } else {
      if (flags & GA_USE_CLUDA) {
        strb_appends(&sb, CUDA_PREAMBLE);
      }

      // Concatenate all fragments into one NUL-terminated buffer.
      if (lengths == NULL) {
        for (i = 0; i < count; i++)
          strb_appends(&sb, strings[i]);
      } else {
        for (i = 0; i < count; i++) {
          if (lengths[i] == 0)
            strb_appends(&sb, strings[i]);
          else
            strb_appendn(&sb, strings[i], lengths[i]);
        }
      }

      strb_append0(&sb);

      if (strb_error(&sb)) {
        // NOTE(review): this path returns without reporting a code via
        // `ret`, unlike the FAIL() exits -- confirm this is intended.
        strb_clear(&sb);
        cuda_exit(ctx);
        return NULL;
      }

      if (ptx_mode) {
        // The assembled text is the module image; ownership of the buffer
        // moves from sb to bin, so sb is deliberately not cleared here.
        bin = sb.s;
        bin_len = sb.l;
      } else {
        bin = NULL;
        // Try the compile cache first, keyed on source text + bin_id.
        if (compile_cache != NULL) {
          k.src = sb.s;
          k.len = sb.l;
          memcpy(k.arch, ctx->bin_id, BIN_ID_LEN);
          av = cache_get(compile_cache, &k);
          if (av != NULL) {
            bin = memdup(av->bin, av->len);
            bin_len = av->len;
          }
        }
        // Cache miss (or the copy failed): compile the source.
        if (bin == NULL) {
          bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
                              &log, &log_len, ret);
        }
        if (bin == NULL) {
          if (err_str != NULL) {
            strb debug_msg = STRB_STATIC_INIT;

            // We're substituting debug_msg for a string with this first line:
            strb_appends(&debug_msg, "CUDA kernel build failure ::\n");

            /* Delete the final NUL */
            sb.l--;
            gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
                                               &sb.l, &debug_msg);

            if (log != NULL) {
              strb_appends(&debug_msg, "\nCompiler log:\n");
              strb_appendn(&debug_msg, log, log_len);
            }
            *err_str = strb_cstr(&debug_msg);
            // *err_str will be free()d by the caller (see docs in kernel.h)
          }
          // Release the compiler log whether or not the caller asked for a
          // message; previously it leaked when err_str was NULL.
          free(log);
          strb_clear(&sb);
          cuda_exit(ctx);
          return NULL;
        }
        // The log is only reported on failure; release any log produced by
        // a successful compile so it does not leak.  free(NULL) is a no-op
        // for the cache-hit case.
        free(log);
        log = NULL;

        // Lazily create the cache; failing to create or fill it is not an
        // error, the freshly compiled binary is simply not remembered.
        if (compile_cache == NULL)
          compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free,
                                     bin_free);

        if (compile_cache != NULL) {
          ak = malloc(sizeof(*ak));
          av = malloc(sizeof(*av));
          if (ak == NULL || av == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->src = memdup(sb.s, sb.l);
          if (ak->src == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->len = sb.l;
          memmove(ak->arch, ctx->bin_id, BIN_ID_LEN);
          av->len = bin_len;
          av->bin = memdup(bin, bin_len);
          if (av->bin == NULL) {
            src_free(ak);
            free(av);
            goto done;
          }
          cache_add(compile_cache, ak, av);
        }
      done:
        strb_clear(&sb);
      }
    }

    res = calloc(1, sizeof(*res));
    if (res == NULL) {
      free(bin);
      cuda_exit(ctx);
      FAIL(NULL, GA_SYS_ERROR);
    }

    // From here on `res` owns `bin`; failures release both through
    // _cuda_freekernel().
    res->bin_sz = bin_len;
    res->bin = bin;

    res->refcnt = 1;
    res->argcount = argcount;
    res->types = calloc(argcount, sizeof(int));
    if (res->types == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }
    memcpy(res->types, types, argcount*sizeof(int));
    res->args = calloc(argcount, sizeof(void *));
    if (res->args == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }

    ctx->err = cuModuleLoadData(&res->m, bin);

    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    // The kernel keeps a reference on its context.
    res->ctx = ctx;
    ctx->refcnt++;
    cuda_exit(ctx);
    TAG_KER(res);
    return res;
}
Exemple #20
0
// Extended module-load entry point. This implementation does not act on the
// JIT options: it forwards `module` and `image` to cuModuleLoadData and
// returns that call's result unchanged.
CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)
{
	// Accepted for signature compatibility only; intentionally unused.
	(void)numOptions;
	(void)options;
	(void)optionValues;

	const CUresult status = cuModuleLoadData(module, image);
	return status;
}
Exemple #21
0
// Load a PTX program, JIT-link it, load the result as a module and look up
// the requested kernel.
//
//   program       path of a PTX file when bFromFile is true, otherwise the
//                 NUL-terminated PTX text itself
//   functionName  name of the kernel entry point to locate
//   phModule      receives the loaded module
//   phKernel      receives the kernel function handle
//   lState        receives the link state created here; it is destroyed
//                 again before a successful return
//   bFromFile     selects cuLinkAddFile (true) or cuLinkAddData (false)
//
// Returns CUDA_SUCCESS, or the error from cuLinkAddFile/cuLinkAddData after
// logging the linker error text. All other driver calls are wrapped in
// checkCudaErrors.
CUresult ptxJIT(const char* program, const char* functionName, CUmodule *phModule, CUfunction *phKernel, CUlinkState *lState, bool bFromFile)
{
	CUjit_option options[6];
	void *optionVals[6];
	float walltime(0);
	const unsigned logSize(8192);
	char error_log[logSize], info_log[logSize];
	void *cuOut;
	size_t outSize;

	// Start with empty logs: both buffers are printed with %s below, and the
	// driver is not guaranteed to have written anything into them (e.g. when
	// the input file cannot be opened).
	error_log[0] = '\0';
	info_log[0] = '\0';

	// Setup linker options
	// Return walltime from JIT compilation
	options[0] = CU_JIT_WALL_TIME;
	optionVals[0] = (void*)&walltime;
	// Pass a buffer for info messages
	options[1] = CU_JIT_INFO_LOG_BUFFER;
	optionVals[1] = (void*)info_log;
	// Pass the size of the info buffer (the integer is carried in the
	// pointer-sized slot; size_t avoids a width mismatch where long is
	// narrower than a pointer)
	options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	optionVals[2] = (void*)(size_t)logSize;
	// Pass a buffer for error message
	options[3] = CU_JIT_ERROR_LOG_BUFFER;
	optionVals[3] = (void*)error_log;
	// Pass the size of the error buffer
	options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
	optionVals[4] = (void*)(size_t)logSize;
	// Make the linker verbose
	options[5] = CU_JIT_LOG_VERBOSE;
	optionVals[5] = (void*)1;

	// Create a pending linker invocation
	checkCudaErrors(cuLinkCreate(6, options, optionVals, lState));

	DEBUG("Loading '%s' program", functionName);
	CUresult myErr;
	if (bFromFile) {
		// Load the PTX from the file (64-bit)
		myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, program, 0, 0, 0);
	} else {
		// Load the PTX from the string myPtx (64-bit); the size includes the
		// terminating NUL
		myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)program, strlen(program)+1, 0, 0, 0, 0);
	}
	if (myErr != CUDA_SUCCESS) {
		// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
		VERBOSE("PTX Linker Error: %s", error_log);
		// NOTE(review): *lState is left alive on this path while the success
		// path destroys it -- confirm whether callers clean it up on failure.
		return myErr;
	}

	// Complete the linker step
	checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize));

	// Linker walltime and info_log were requested in options above.
	DEBUG("CUDA link completed (%gms):\n%s", walltime, info_log);

	// Load resulting cuBin into module
	checkCudaErrors(cuModuleLoadData(phModule, cuOut));

	// Locate the kernel entry point
	checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, functionName));

	// Destroy the linker invocation
	checkCudaErrors(cuLinkDestroy(*lState));
	return CUDA_SUCCESS;
}
Exemple #22
0
// Runs one of the embedded PTX shift-right kernels and returns its result.
//
// The PTX module below defines four entry points (s32, s64, u32 and u64
// variants). Each one shifts its second parameter right by its third and
// stores the result through the pointer passed as its first parameter.
//
//   name        mangled name of the entry point to launch
//   input       value to be shifted
//   shiftValue  shift amount
//
// The kernel is launched with a single thread in a single block. Every
// driver call is checked with cu_assert.
T run_function(const std::string& name, const T input, const int shiftValue) {
	const std::string ptxSource =
	"//\n"
	"// Generated by NVIDIA NVVM Compiler\n"
	"//\n"
	"// Compiler Build ID: CL-19856038\n"
	"// Cuda compilation tools, release 7.5, V7.5.17\n"
	"// Based on LLVM 3.4svn\n"
	"//\n"
	"\n"
	".version 4.3\n"
	".target sm_20\n"
	".address_size 64\n"
	"\n"
	"	// .globl	_Z10kernel_s32Piii\n"
	"\n"
	".visible .entry _Z10kernel_s32Piii(\n"
	"	.param .u64 _Z10kernel_s32Piii_param_0,\n"
	"	.param .u32 _Z10kernel_s32Piii_param_1,\n"
	"	.param .u32 _Z10kernel_s32Piii_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<4>;\n"
	"	.reg .b64 	%rd<3>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_s32Piii_param_0];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_s32Piii_param_1];\n"
	"	ld.param.u32 	%r2, [_Z10kernel_s32Piii_param_2];\n"
	"	cvta.to.global.u64 	%rd2, %rd1;\n"
	"	shr.s32 	%r3, %r1, %r2;\n"
	"	st.global.u32 	[%rd2], %r3;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_s64Pxxi\n"
	".visible .entry _Z10kernel_s64Pxxi(\n"
	"	.param .u64 _Z10kernel_s64Pxxi_param_0,\n"
	"	.param .u64 _Z10kernel_s64Pxxi_param_1,\n"
	"	.param .u32 _Z10kernel_s64Pxxi_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<2>;\n"
	"	.reg .b64 	%rd<5>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_s64Pxxi_param_0];\n"
	"	ld.param.u64 	%rd2, [_Z10kernel_s64Pxxi_param_1];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_s64Pxxi_param_2];\n"
	"	cvta.to.global.u64 	%rd3, %rd1;\n"
	"	shr.s64 	%rd4, %rd2, %r1;\n"
	"	st.global.u64 	[%rd3], %rd4;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_u32Pjji\n"
	".visible .entry _Z10kernel_u32Pjji(\n"
	"	.param .u64 _Z10kernel_u32Pjji_param_0,\n"
	"	.param .u32 _Z10kernel_u32Pjji_param_1,\n"
	"	.param .u32 _Z10kernel_u32Pjji_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<4>;\n"
	"	.reg .b64 	%rd<3>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_u32Pjji_param_0];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_u32Pjji_param_1];\n"
	"	ld.param.u32 	%r2, [_Z10kernel_u32Pjji_param_2];\n"
	"	cvta.to.global.u64 	%rd2, %rd1;\n"
	"	shr.u32 	%r3, %r1, %r2;\n"
	"	st.global.u32 	[%rd2], %r3;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_u64Pyyi\n"
	".visible .entry _Z10kernel_u64Pyyi(\n"
	"	.param .u64 _Z10kernel_u64Pyyi_param_0,\n"
	"	.param .u64 _Z10kernel_u64Pyyi_param_1,\n"
	"	.param .u32 _Z10kernel_u64Pyyi_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<2>;\n"
	"	.reg .b64 	%rd<5>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_u64Pyyi_param_0];\n"
	"	ld.param.u64 	%rd2, [_Z10kernel_u64Pyyi_param_1];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_u64Pyyi_param_2];\n"
	"	cvta.to.global.u64 	%rd3, %rd1;\n"
	"	shr.u64 	%rd4, %rd2, %r1;\n"
	"	st.global.u64 	[%rd3], %rd4;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"\n"
	;

	// Load the PTX text and resolve the requested entry point.
	CUmodule module = 0;
	CUfunction kernel = 0;
	cu_assert(cuModuleLoadData(&module, ptxSource.c_str()));
	cu_assert(cuModuleGetFunction(&kernel, module, name.c_str()));

	// Device-side slot the kernel stores its result into.
	T hostResult;
	CUdeviceptr deviceResult;
	cu_assert(cuMemAlloc(&deviceResult, sizeof(hostResult)));

	// Argument pointers in kernel-parameter order: output pointer, value to
	// shift, shift amount.
	void * kernelArgs[] = {&deviceResult, (void*)&input, (void*)&shiftValue};

	// One block of one thread. The status is captured first and checked
	// afterwards, so the launch expression itself is evaluated exactly once.
	auto launchStatus = cuLaunchKernel(kernel, 1,1,1, 1,1,1, 0,0, kernelArgs, nullptr);
	cu_assert(launchStatus);

	// Fetch the result, then release the device buffer and the module.
	cu_assert(cuMemcpyDtoH(&hostResult, deviceResult, sizeof(hostResult)));
	cu_assert(cuMemFree(deviceResult));
	cu_assert(cuModuleUnload(module));
	return hostResult;
}