Example #1
0
/*
 * Compile CUDA source text to PTX with NVRTC.
 *
 *   src      - source text handed to nvrtcCreateProgram() (a C string).
 *   len      - not used by this function.
 *   arch_arg - value placed right after the "-arch" option.
 *   bin_len  - receives the size reported by nvrtcGetPTXSize() on success.
 *   log      - if non-NULL, receives a malloc()ed copy of the compile log
 *              (caller frees).  Left untouched if the log cannot be fetched.
 *   log_len  - if non-NULL (and log is non-NULL), receives the log size.
 *   ret      - not referenced directly here; presumably consumed by the FAIL
 *              macro to report the error code -- confirm against its
 *              definition.
 *
 * Returns a malloc()ed buffer holding the PTX, or NULL (through FAIL) on
 * error.  The compile log is fetched even when compilation itself failed, so
 * the caller can show diagnostics.
 */
static void *call_compiler(const char *src, size_t len, const char *arch_arg,
                           size_t *bin_len, char **log, size_t *log_len,
                           int *ret) {
  nvrtcProgram prog;
  void *buf = NULL;
  size_t buflen;
  const char *opts[4] = {
    "-arch", ""
    , "-G", "-lineinfo"
  };
  nvrtcResult err, err2;

  opts[1] = arch_arg;

  err = nvrtcCreateProgram(&prog, src, NULL, 0, NULL, NULL);
  if (err != NVRTC_SUCCESS) FAIL(NULL, GA_SYS_ERROR);

  /* The two debug options are only passed when DEBUG is defined. */
  err = nvrtcCompileProgram(prog,
#ifdef DEBUG
                            4,
#else
                            2,
#endif
                            opts);

  /* Best-effort log retrieval: failures here never change `err`. */
  if (log != NULL) {
    err2 = nvrtcGetProgramLogSize(prog, &buflen);
    if (err2 != NVRTC_SUCCESS) goto end2;
    buf = malloc(buflen);
    if (buf == NULL) goto end2;
    err2 = nvrtcGetProgramLog(prog, (char *)buf);
    if (err2 != NVRTC_SUCCESS) goto end2;
    if (log_len != NULL) *log_len = buflen;
    *log = (char *)buf;
    buf = NULL; /* ownership moved to the caller */
  }
end2:
  /*
   * If the log buffer was allocated but could not be filled, it is still
   * held in `buf` here.  Release it now so that the PTX allocation below
   * does not overwrite (and leak) it.  free(NULL) is a no-op.
   */
  free(buf);
  buf = NULL;

  if (err != NVRTC_SUCCESS) goto end;

  err = nvrtcGetPTXSize(prog, &buflen);
  if (err != NVRTC_SUCCESS) goto end;

  buf = malloc(buflen);
  if (buf == NULL) {
    /* Out of memory is reported with its own error code. */
    nvrtcDestroyProgram(&prog);
    FAIL(NULL, GA_MEMORY_ERROR);
  }

  err = nvrtcGetPTX(prog, (char *)buf);
  if (err != NVRTC_SUCCESS) goto end;

  *bin_len = buflen;

end:
  nvrtcDestroyProgram(&prog);
  if (err != NVRTC_SUCCESS) {
    free(buf);
    FAIL(NULL, GA_SYS_ERROR);
  }
  return buf;
}
Example #2
0
/*
 * build_kernel_source
 *
 * Compiles the kernel source loaded from `source_file` with NVRTC for
 * compute capability `target_capability`, prints the build log to stdout
 * when it is non-empty, optionally links the device runtime, and loads the
 * resulting image with cuModuleLoadData().
 *
 * Returns the loaded CUmodule.  The process exits with status 1 on a
 * compilation error or when out of memory.  Other failures are reported
 * through nvrtc_error() / cuda_error(), which presumably do not return --
 * confirm against their definitions.
 *
 * NOTE(review): `source` returned by load_kernel_source() is not released
 * here because its ownership is not visible from this function.
 */
static CUmodule
build_kernel_source(const char *source_file, long target_capability)
{
	char		   *source;
	int				link_dev_runtime;
	nvrtcProgram	program;
	nvrtcResult		rc;
	char			arch_buf[128];
	const char	   *options[10];
	int				opt_index = 0;
	int				build_failure = 0;
	char		   *build_log;
	size_t			build_log_len;
	char		   *ptx_image;
	size_t			ptx_image_len;
	void		   *bin_image;
	size_t			bin_image_len;
	CUmodule		cuda_module;
	CUresult		cuda_rc;

	source = load_kernel_source(source_file, &link_dev_runtime);
	rc = nvrtcCreateProgram(&program,
							source,
							NULL,
							0,
							NULL,
							NULL);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcCreateProgram");

	/*
	 * Put command line options as cuda_program.c doing
	 * (at most 6 entries are stored, so options[10] is large enough)
	 */
	options[opt_index++] = "-I " CUDA_INCLUDE_PATH;
	snprintf(arch_buf, sizeof(arch_buf),
			 "--gpu-architecture=compute_%ld", target_capability);
	options[opt_index++] = arch_buf;
#ifdef PGSTROM_DEBUG
	options[opt_index++] = "--device-debug";
	options[opt_index++] = "--generate-line-info";
#endif
	options[opt_index++] = "--use_fast_math";
	if (link_dev_runtime)
		options[opt_index++] = "--relocatable-device-code=true";

	/*
	 * Kick runtime compiler. A compilation error is remembered rather than
	 * reported immediately, so that the build log can be printed first.
	 */
	rc = nvrtcCompileProgram(program, opt_index, options);
	if (rc != NVRTC_SUCCESS)
	{
		if (rc == NVRTC_ERROR_COMPILATION)
			build_failure = 1;
		else
			nvrtc_error(rc, "nvrtcCompileProgram");
	}

	/*
	 * Print build log
	 */
	rc = nvrtcGetProgramLogSize(program, &build_log_len);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetProgramLogSize");
	build_log = malloc(build_log_len + 1);
	if (!build_log)
	{
		fputs("out of memory", stderr);
		exit(1);
	}
	rc = nvrtcGetProgramLog(program, build_log);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetProgramLog");

	/* a log of length 1 holds only the terminating NUL */
	if (build_log_len > 1)
		printf("build log:\n%s\n", build_log);
	free(build_log);
	if (build_failure)
		exit(1);

	/*
	 * Get PTX Image
	 */
	rc = nvrtcGetPTXSize(program, &ptx_image_len);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetPTXSize");
	ptx_image = malloc(ptx_image_len + 1);
	if (!ptx_image)
	{
		fputs("out of memory", stderr);
		exit(1);
	}
	rc = nvrtcGetPTX(program, ptx_image);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetPTX");
	ptx_image[ptx_image_len] = '\0';

	/*
	 * The PTX now lives in our own buffer, so the NVRTC program object
	 * is no longer needed.
	 */
	rc = nvrtcDestroyProgram(&program);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcDestroyProgram");

	/*
	 * Link device runtime if needed
	 */
	if (link_dev_runtime)
	{
		link_device_libraries(ptx_image, ptx_image_len,
							  &bin_image, &bin_image_len,
							  target_capability);
	}
	else
	{
		bin_image = ptx_image;
		bin_image_len = ptx_image_len;
	}

	cuda_rc = cuModuleLoadData(&cuda_module, bin_image);
	if (cuda_rc != CUDA_SUCCESS)
		cuda_error(cuda_rc, "cuModuleLoadData");	/* report the CUresult, not the stale nvrtcResult */

	/*
	 * The module has been loaded, so the PTX text is no longer needed.
	 * NOTE(review): when the device runtime was linked, bin_image is owned
	 * by whatever link_device_libraries() allocated; it is not released
	 * here because that ownership is not visible from this function.
	 */
	free(ptx_image);

	return cuda_module;
}
Example #3
0
/*
 * RTC_VIT: compile the kernel source `GPU_kernel` at run time with NVRTC,
 * launch its "KERNEL" entry point through the CUDA driver API and count how
 * many sequences pass the F2 threshold.
 *
 *   number     - number of sequences; length of offset/seq_len/iLen/pVal
 *   GPU_kernel - CUDA source text handed to nvrtcCreateProgram()
 *   hmm        - profile; vit_vec/trans_vec are copied to the device, and
 *                base_vs, E_lm, ddbound_vs, scale_w, vitQ, MU[1], LAMBDA[1]
 *                are passed as kernel arguments
 *   seq_1D     - `sum` unsigned ints copied to the device (1D database)
 *   offset     - offset of each sequence
 *   seq_len    - raw length of each sequence
 *   iLen       - padded length of each sequence
 *   sum        - number of elements in seq_1D
 *   pVal       - host output: receives `number` doubles from the device
 *   warp       - value appended to the RIB option (warps per block)
 *   maxreg     - value appended to the max-register option
 *   GRID/BLOCK - launch configuration
 *
 * A fresh context is created on device 0 for each call; all device buffers,
 * the module and the context are released before returning. Timings for
 * each stage and the pass count are printed to stdout.
 *
 * NOTE(review): checkCudaErrors / NVRTC_SAFE_CALL are defined elsewhere and
 * presumably abort on failure; in that case the cleanup at the end is not
 * reached -- confirm against their definitions.
 */
void RTC_VIT(unsigned int number, const char* GPU_kernel, HMMER_PROFILE *hmm,
	     unsigned int *seq_1D, unsigned int *offset, unsigned int *seq_len,
	     unsigned int *iLen, unsigned int sum, double *pVal,
	     int warp, int maxreg, dim3 GRID, dim3 BLOCK)
{
	/*********************************/
	/* 0. Prepare for cuda drive API */
	/*********************************/
	CUdevice cuDevice;
	CUcontext context;
	CUmodule module;
	CUfunction kernel;

	checkCudaErrors(cuInit(0));
	checkCudaErrors(cuDeviceGet(&cuDevice, 0));
	checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));

	/*********************************************/
	/* 1. Device Property: fixed based on Device */
	/*********************************************/

	/****************************************/
	/* 2. Device Memory Allocation and copy */
	/****************************************/
	StopWatchInterface *timer;
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	/* Driver API pointers */
	CUdeviceptr d_seq, d_offset, d_len, d_len_6r, mat_v, trans, score;

	/* Allocation */
	checkCudaErrors(cuMemAlloc(&d_seq, sum * sizeof(unsigned int)));					/* copy 1D database */
	checkCudaErrors(cuMemAlloc(&d_offset, number * sizeof(unsigned int)));				/* copy offset of each seq*/
	checkCudaErrors(cuMemAlloc(&d_len, number * sizeof(unsigned int)));					/* copy raw length of each seq */
	checkCudaErrors(cuMemAlloc(&d_len_6r, number * sizeof(unsigned int)));				/* copy padding length of each seq */
	checkCudaErrors(cuMemAlloc(&mat_v, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__)));	/* striped EMISSION score */
	checkCudaErrors(cuMemAlloc(&trans, hmm->vitQ * TRANS_TYPE * sizeof(__32int__)));	/* striped transition score */
	checkCudaErrors(cuMemAlloc(&score, number * sizeof(double)));						/* P-Value as output */

	/* H to D copy */
	checkCudaErrors(cuMemcpyHtoD(d_seq, seq_1D, sum * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_offset, offset, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_len, seq_len, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_len_6r, iLen, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(mat_v, hmm->vit_vec, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__)));
	checkCudaErrors(cuMemcpyHtoD(trans, hmm->trans_vec, hmm->vitQ * TRANS_TYPE * sizeof(__32int__)));

	sdkStopTimer(&timer);
	printf("Alloc & H to D Copy time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/********************************************************/
	/* 3. Runtime compilation, Generate PTX and Load module */
	/********************************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	/* NVRTC create handle */
	nvrtcProgram prog;
	NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog,			// prog
															 GPU_kernel,	// buffer
															 NULL,			// name: CUDA program name. name can be NULL; "default_program" is used when it is NULL.
															 0,				// numHeaders (I put header file path with -I later)
															 NULL,			// headers' content
															 NULL));		// include full name of headers

	/*
	 * Dynamic options.
	 *
	 * The strings own their storage, so no buffer length has to be guessed
	 * and nothing has to be freed by hand. They must stay alive until
	 * nvrtcCompileProgram() has returned, because `opts` only borrows
	 * their c_str() pointers.
	 */
	const std::string option_text[] = {
		get_option(__INCLUDE__, "simd_def.h"),					// #include simd_def.h
		get_option(__INCLUDE__, "simd_functions.h"),			// #include simd_functions.h
		get_option(__RDC__, __F__),								// -rdc=false
		get_option(__ARCH__, __CC35__),							// -arch=compute_35
		get_option(__MAXREG__, int2str(maxreg)),				// -maxrregcount = <?>
		get_option(__RIB__, int2str(warp)),						// #define RIB <?> : warps per block
		get_option(__SIZE__, int2str((int)force_local_size)),	// #define SIZE 40
		get_option(__Q__, int2str(hmm->vitQ))					// #define Q <?>
	};
	const int num_options = (int)(sizeof(option_text) / sizeof(option_text[0]));

	const char *opts[sizeof(option_text) / sizeof(option_text[0])];
	for (int k = 0; k < num_options; k++)
		opts[k] = option_text[k].c_str();

	/* NVRTC compile */
	NVRTC_SAFE_CALL("nvrtcCompileProgram", nvrtcCompileProgram(prog,			// prog
															   num_options,		// numOptions
															   opts));			// options

	sdkStopTimer(&timer);
	printf("nvrtc Creat and Compile: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	//======================================================================================//
	// /* dump log */																		//
	// size_t logSize;																		//
	// NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize));	//
	// char *log = (char *) malloc(sizeof(char) * logSize + 1);								//
	// NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));				//
	// log[logSize] = '\x0';																//
	// std::cerr << "\n compilation log ---\n";												//
	// std::cerr << log;																	//
	// std::cerr << "\n end log ---\n";														//
	// free(log);																			//
	//======================================================================================//

	/* NVRTC fetch PTX */
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	size_t ptxsize;
	NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxsize));
	char *ptx = new char[ptxsize];
	NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));
	NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));	// destroy program instance

	/* Launch PTX by driver API */
	checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
	delete[] ptx;														// the module is loaded; the PTX text is no longer needed
	checkCudaErrors(cuModuleGetFunction(&kernel, module, "KERNEL"));	// return the handle of function, name is the same as real kernel function

	sdkStopTimer(&timer);
	printf("Compile & Load time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/**************************************/
	/* 4. GPU kernel launch by driver API */
	/**************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	checkCudaErrors(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1));

	/* parameters for kernel function: each entry points at the argument's storage */
	void *arr[] = { &d_seq, &number, &d_offset,
					&score, &d_len, &d_len_6r, &mat_v, &trans,
					&(hmm->base_vs), &(hmm->E_lm), &(hmm->ddbound_vs),
					&(hmm->scale_w), &(hmm->vitQ), &(hmm->MU[1]), &(hmm->LAMBDA[1])};

	/* launch kernel */
	checkCudaErrors(cuLaunchKernel(	kernel,
									GRID.x, GRID.y, GRID.z,		/* grid dim */
									BLOCK.x, BLOCK.y, BLOCK.z,	/* block dim */
									0,0,						/* SMEM, stream */
									&arr[0],					/* kernel params */
									0));						/* extra opts */

	/* wait for kernel finish */
	checkCudaErrors(cuCtxSynchronize());			/* block for a context's task to complete */

	sdkStopTimer(&timer);
	printf("Kernel time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/*****************************************/
	/* 5. P-value return and post-processing */
	/*****************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	checkCudaErrors(cuMemcpyDtoH(pVal, score, number * sizeof(double)));

	sdkStopTimer(&timer);
	printf("D to H copy time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/* count the number of seqs pass */
	unsigned long pass_vit = 0;			/* # of seqs pass vit */

	for (unsigned int i = 0; i < number; i++)
	{
		if (pVal[i] <= F2)
			pass_vit++;
	}

	/* pass_vit is unsigned long, so it must be printed with %lu */
	printf("|			PASS VIT 			\n");
	printf("|	 ALL	|	 FWD	|\n");
	printf("|	%lu  	|	%lu  	|\n",  pass_vit, pass_vit);

	/************************/
	/* 6. clean the context */
	/************************/
	/* release everything this call allocated on the device */
	checkCudaErrors(cuMemFree(d_seq));
	checkCudaErrors(cuMemFree(d_offset));
	checkCudaErrors(cuMemFree(d_len));
	checkCudaErrors(cuMemFree(d_len_6r));
	checkCudaErrors(cuMemFree(mat_v));
	checkCudaErrors(cuMemFree(trans));
	checkCudaErrors(cuMemFree(score));
	checkCudaErrors(cuModuleUnload(module));

	checkCudaErrors(cuDevicePrimaryCtxReset(cuDevice));		/* reset */
	checkCudaErrors(cuCtxSynchronize());					/* block for a context's task to complete */

	/* the context made by cuCtxCreate() above is ours to destroy */
	checkCudaErrors(cuCtxDestroy(context));
}