Пример #1
0
/*
 * JIT-compiles the PTX text held in `moduleSrc` with the CUDA driver and
 * looks up the kernel called `function_name` in the resulting module.
 *
 * The JIT is asked for an info log (1024 bytes), a limit of 32 registers per
 * thread, and a target derived from the current context. When either driver
 * call fails, the info log is written to stderr.
 *
 * `gi` is registered as a GC root but is not otherwise used here.
 *
 * Returns the address of a heap-allocated CUfunction, cast to `value`. The
 * handle is NULL when the module could not be loaded or the kernel was not
 * found; the last driver status is left in `cuda_error`.
 */
CAMLprim value spoc_cuda_debug_compile(value moduleSrc, value function_name, value gi){
	CAMLparam3(moduleSrc, function_name, gi);
	CUmodule module;
	CUfunction *kernel;
	char* functionN;
	char *ptx_source;
	const unsigned int jitNumOptions = 4;

	CUjit_option jitOptions[4];
	void *jitOptVals[4];
	int jitLogBufferSize;
	char *jitLogBuffer;
	int jitRegCount = 32;

	BLOCKING_CUDA_GET_CONTEXT;

	/* calloc so that the returned handle is NULL rather than garbage
	 * whenever the lookup below does not succeed */
	kernel = calloc(1, sizeof(CUfunction));
	functionN = String_val(function_name);
	ptx_source = String_val(moduleSrc);

	// set up size of compilation log buffer
	jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	jitLogBufferSize = 1024;
	jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

	// set up pointer to the compilation log buffer
	// (zero-filled: the driver may fail before writing anything into it,
	// and the buffer is printed with %s below)
	jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
	jitLogBuffer = calloc(jitLogBufferSize, sizeof(char));
	jitOptVals[1] = jitLogBuffer;

	// set up pointer to set the Maximum # of registers for a particular kernel
	jitOptions[2] = CU_JIT_MAX_REGISTERS;
	jitOptVals[2] = (void *)(size_t)jitRegCount;

	// target is taken from the current context; the option takes no value,
	// but the slot is still handed to the driver so give it a defined content
	jitOptions[3] = CU_JIT_TARGET_FROM_CUCONTEXT;
	jitOptVals[3] = NULL;

	cuda_error = (cuModuleLoadDataEx(&module, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals));
	if (cuda_error)
	  {
	    fprintf (stderr,"%s\n", jitLogBuffer ? jitLogBuffer : "");
	    fflush (stderr);
	  }
	else
	  {
	    /* only query the module when it was actually loaded; `module` is
	     * indeterminate after a failed load */
	    cuda_error = (cuModuleGetFunction(kernel, module, functionN));
	    if (cuda_error)
	      {
	        fprintf (stderr, "%s\n", jitLogBuffer ? jitLogBuffer : "");
	        fflush (stderr);
	      }
	  }
	BLOCKING_CUDA_RESTORE_CONTEXT;
	free(jitLogBuffer);
	CAMLreturn((value) kernel);
}
Пример #2
0
// Makes `device_id` the active device through the supplied CudaMgr, loads
// `image` as a CUDA module using the given JIT options, and resolves the
// kernel named `kernel_name` inside that module.
//
// `options` and `option_vals` are forwarded unchanged to cuModuleLoadDataEx
// and must both hold `num_options` entries.
GpuCompilationContext::GpuCompilationContext(const void* image,
                                             const std::string& kernel_name,
                                             const int device_id,
                                             const void* cuda_mgr,
                                             unsigned int num_options,
                                             CUjit_option* options,
                                             void** option_vals)
    : module_(nullptr), kernel_(nullptr), device_id_(device_id), cuda_mgr_(cuda_mgr) {
  // The manager is stored type-erased; recover its real type before use.
  const auto* manager = static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_);
  manager->setContext(device_id_);

  // Load the module first: the kernel lookup below needs a valid handle.
  checkCudaErrors(cuModuleLoadDataEx(&module_, image, num_options, options, option_vals));
  CHECK(module_);

  const char* entry_point = kernel_name.c_str();
  checkCudaErrors(cuModuleGetFunction(&kernel_, module_, entry_point));
}
Пример #3
0
/*
 * Builds a CUDA module (GPU machine code) from a PTX ISA assembly dump.
 *
 * Sets up the JIT options (1024-byte info log, at most 32 registers per
 * thread), runs the driver's JIT compiler on `KernelPTXDump` and prints the
 * resulting info log through qDebug().
 *
 * gpu : when non-null, its context is pushed for the duration of the call
 *       and popped again before returning. When null, the context that is
 *       already current is used and left untouched.
 *
 * Returns a heap-allocated CUmodule; the caller takes ownership of it.
 */
CUmodule * CudaCompiler::compilePTX(uchar * KernelPTXDump, GPU * gpu=0)
{
    // consider the default context is in run if gpu==0
    // TODO : make it mandatory
    const bool contextPushed = (gpu != 0);
    if(contextPushed) CUDCHK( cuCtxPushCurrent(gpu->context) );

    // in this branch we use compilation with parameters
    const unsigned int jitNumOptions = 3;
    const int jitLogBufferSize = 1024;
    const int jitRegCount = 32;

    // Zero-filled so that it is a valid (empty) C string even when the JIT
    // writes nothing into it.
    char jitLogBuffer[jitLogBufferSize] = { 0 };

    CUjit_option jitOptions[jitNumOptions] = {
        CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,  // size of the compilation log buffer
        CU_JIT_INFO_LOG_BUFFER,             // pointer to the compilation log buffer
        CU_JIT_MAX_REGISTERS                // maximum # of registers for a kernel
    };

    // Integer options are passed by value inside the pointer-sized slot.
    void * jitOptVals[jitNumOptions] = {
        (void *)(size_t)jitLogBufferSize,
        jitLogBuffer,
        (void *)(size_t)jitRegCount
    };

    // compile with set parameters
    qDebug()<<"> Compiling PTX module";
    CUmodule * cuModule = new CUmodule();

    CUDCHK( cuModuleLoadDataEx( cuModule, KernelPTXDump, jitNumOptions, jitOptions, jitOptVals) );

    qDebug()<< "PTX JIT log: \n [" <<  jitLogBuffer <<"]" ;

    // Only undo the push made above: popping when nothing was pushed would
    // detach the caller's current context from this thread.
    if(contextPushed)
    {
        CUcontext popped = 0;
        CUDCHK( cuCtxPopCurrent(&popped) );
    }

    return cuModule;
}
Пример #4
0
// Reads the whole file `filename` into memory and loads it as a CUDA module,
// asking the JIT to limit kernels to `maxRegisters` registers per thread.
//
// On success the new module is swapped into `*ppModule`, its `_context` is
// set to this context, and CUDA_SUCCESS is returned.
// Returns CUDA_ERROR_FILE_NOT_FOUND when the file cannot be opened; a failing
// driver status is handed to HANDLE_RESULT().
CUresult CuContext::LoadModuleFilenameEx(const std::string& filename, 
	ModulePtr* ppModule, uint maxRegisters) {

	std::ifstream file(filename.c_str());
	if(!file.good()) return CUDA_ERROR_FILE_NOT_FOUND;

	std::string contents((std::istreambuf_iterator<char>(file)),
		std::istreambuf_iterator<char>());

	ModulePtr module(new CuModule);

	CUjit_option options[1] = { CU_JIT_MAX_REGISTERS };

	// The driver reads one pointer-sized slot per option, and integer options
	// are passed by value inside that slot. An array of 32-bit uints is too
	// small on 64-bit hosts (the driver would read past it), so store the
	// value widened into a real void* array.
	void* values[1] = {
		reinterpret_cast<void*>(static_cast<size_t>(maxRegisters))
	};

	CUresult result = cuModuleLoadDataEx(&module->_module, contents.c_str(),
		1, options, values);
	HANDLE_RESULT();

	module->_context = this;
	ppModule->swap(module);
	return CUDA_SUCCESS;
}
Пример #5
0
/// main - Program entry point
///
/// Usage: <program> dataCount blockSize
///
/// Loads "kernel.ptx" from the working directory, launches its entry point
/// "kernel" over `dataCount` elements with `blockSize` threads per block, and
/// prints one result line each time the recorded SM id changes between
/// consecutive elements.
///
/// Returns 0 on success and 1 on bad arguments, a missing PTX file or an
/// unsuitable device.
int main(int argc, char** argv) {
  if (argc != 3) {
    printf("Usage: %s dataCount blockSize\n", argv[0]);
    exit(1);
  }

  // Validate the sizes up front. A zero block size would divide by zero in
  // the grid computation, a zero element count would lead to zero-byte
  // allocations and a read of element 0 of empty arrays, and negative values
  // would wrap around to huge unsigned numbers.
  const int dataCountArg = atoi(argv[1]);
  const int blockSizeArg = atoi(argv[2]);
  if (dataCountArg <= 0 || blockSizeArg <= 0) {
    std::cerr << "ERROR: dataCount and blockSize must be positive integers\n";
    return 1;
  }

  CUdevice device;
  CUmodule cudaModule;
  CUcontext context;
  CUfunction function;
  int devCount;

  // CUDA initialization
  checkCudaErrors(cuInit(0));
  checkCudaErrors(cuDeviceGetCount(&devCount));
  if (devCount < 1) {
    std::cerr << "ERROR: no CUDA device found\n";
    return 1;
  }
  checkCudaErrors(cuDeviceGet(&device, 0));

  char name[128];
  checkCudaErrors(cuDeviceGetName(name, 128, device));
  std::cout << "Using CUDA Device [0]: " << name << "\n";

  int devMajor, devMinor;
  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
  std::cout << "Device Compute Capability: " << devMajor << "." << devMinor
            << "\n";
  if (devMajor < 2) {
    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
    return 1;
  }

  std::ifstream t("kernel.ptx");
  if (!t.is_open()) {
    std::cerr << "kernel.ptx not found\n";
    return 1;
  }
  std::string str((std::istreambuf_iterator<char>(t)),
                  std::istreambuf_iterator<char>());

  // Create driver context
  checkCudaErrors(cuCtxCreate(&context, 0, device));

  // Create module for object
  checkCudaErrors(cuModuleLoadDataEx(&cudaModule, str.c_str(), 0, 0, 0));

  // Get kernel function
  checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel"));

  // Device data
  CUdeviceptr devBufferA;
  CUdeviceptr devBufferB;
  CUdeviceptr devBufferC;
  CUdeviceptr devBufferSMid;

  // Size
  const unsigned dataCount = static_cast<unsigned>(dataCountArg);

  checkCudaErrors(cuMemAlloc(&devBufferA, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferB, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferC, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferSMid, sizeof(int) * dataCount));

  float* hostA = new float[dataCount];
  float* hostB = new float[dataCount];
  float* hostC = new float[dataCount];
  int* hostSMid = new int[dataCount];

  // Populate input
  for (unsigned i = 0; i != dataCount; ++i) {
    hostA[i] = (float)i;
    hostB[i] = (float)(2 * i);
    hostC[i] = 2.0f;
    hostSMid[i] = 0;
  }

  checkCudaErrors(
      cuMemcpyHtoD(devBufferA, &hostA[0], sizeof(float) * dataCount));
  checkCudaErrors(
      cuMemcpyHtoD(devBufferB, &hostB[0], sizeof(float) * dataCount));

  const unsigned blockSizeX = static_cast<unsigned>(blockSizeArg);
  const unsigned blockSizeY = 1;
  const unsigned blockSizeZ = 1;
  // Round up so that the tail of the data is covered by a final block.
  const unsigned gridSizeX = (dataCount + blockSizeX - 1) / blockSizeX;
  const unsigned gridSizeY = 1;
  const unsigned gridSizeZ = 1;

  // Kernel parameters
  void* KernelParams[] = {&devBufferA, &devBufferB, &devBufferC,
                          &devBufferSMid};

  std::cout << "Launching kernel\n";

  // Kernel launch
  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
                                 blockSizeX, blockSizeY, blockSizeZ, 0, NULL,
                                 KernelParams, NULL));

  // Retrieve device data
  checkCudaErrors(
      cuMemcpyDtoH(&hostC[0], devBufferC, sizeof(float) * dataCount));
  checkCudaErrors(
      cuMemcpyDtoH(&hostSMid[0], devBufferSMid, sizeof(int) * dataCount));

  std::cout << "Results:\n";
  std::cout << "SM " << hostSMid[0] << ":" << hostA[0] << " + " << hostB[0]
            << " = " << hostC[0] << "\n";
  // Only print an element when its SM id differs from the previous one.
  for (unsigned i = 1; i != dataCount; i++) {
    if (hostSMid[i] != hostSMid[i - 1])
      std::cout << "SM " << hostSMid[i] << ":" << hostA[i] << " + " << hostB[i]
                << " = " << hostC[i] << "\n";
  }

  // Clean up after ourselves
  delete[] hostA;
  delete[] hostB;
  delete[] hostC;
  delete[] hostSMid;

  // Clean-up
  checkCudaErrors(cuMemFree(devBufferA));
  checkCudaErrors(cuMemFree(devBufferB));
  checkCudaErrors(cuMemFree(devBufferC));
  checkCudaErrors(cuMemFree(devBufferSMid));
  checkCudaErrors(cuModuleUnload(cudaModule));
  checkCudaErrors(cuCtxDestroy(context));

  return 0;
}
Пример #6
0
/*
 * Compiles `GPU_kernel` at run time with NVRTC, loads the generated PTX
 * through the CUDA driver API, launches its entry point "KERNEL" and reports
 * how many sequences have a P-value <= F2.
 *
 * number     : number of sequences (length of offset, seq_len, iLen, pVal)
 * GPU_kernel : CUDA source text handed to NVRTC
 * hmm        : profile; vitQ, vit_vec, trans_vec and several scalar fields
 *              are read and passed to the kernel
 * seq_1D     : flattened sequence database, `sum` elements
 * offset     : offset of each sequence
 * seq_len    : raw length of each sequence
 * iLen       : padded length of each sequence
 * sum        : number of elements in seq_1D
 * pVal       : output, receives `number` P-values copied back from the device
 * warp       : value for the RIB compile-time define (warps per block)
 * maxreg     : value for the max-register-count compile option
 * GRID/BLOCK : launch configuration
 *
 * Every phase is timed and the timings are printed to stdout. All host and
 * device resources acquired here are released before returning.
 *
 * NOTE(review): the option buffers are sized as the sum of the two argument
 * lengths plus one; this assumes get_option() returns exactly the
 * concatenation of its arguments - confirm against its definition.
 */
void RTC_VIT(unsigned int number, const char* GPU_kernel, HMMER_PROFILE *hmm,
	     unsigned int *seq_1D, unsigned int *offset, unsigned int *seq_len,
	     unsigned int *iLen, unsigned int sum, double *pVal,
	     int warp, int maxreg, dim3 GRID, dim3 BLOCK)
{
	/*********************************/
	/* 0. Prepare for cuda drive API */
	/*********************************/
	CUdevice cuDevice;
	CUcontext context;
	CUmodule module;
	CUfunction kernel;

	checkCudaErrors(cuInit(0));
	checkCudaErrors(cuDeviceGet(&cuDevice, 0));
	checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));

	/*********************************************/
	/* 1. Device Property: fixed based on Device */
	/*********************************************/

	/****************************************/
	/* 2. Device Memory Allocation and copy */
	/****************************************/
	StopWatchInterface *timer;
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	/* Driver API pointers */
	CUdeviceptr d_seq, d_offset, d_len, d_len_6r, mat_v, trans, score;

	/* Allocation */
	checkCudaErrors(cuMemAlloc(&d_seq, sum * sizeof(unsigned int)));					/* copy 1D database */
	checkCudaErrors(cuMemAlloc(&d_offset, number * sizeof(unsigned int)));				/* copy offset of each seq*/
	checkCudaErrors(cuMemAlloc(&d_len, number * sizeof(unsigned int)));					/* copy raw length of each seq */
	checkCudaErrors(cuMemAlloc(&d_len_6r, number * sizeof(unsigned int)));				/* copy padding length of each seq */
	checkCudaErrors(cuMemAlloc(&mat_v, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__)));	/* striped EMISSION score */
	checkCudaErrors(cuMemAlloc(&trans, hmm->vitQ * TRANS_TYPE * sizeof(__32int__)));	/* striped transition score */
	checkCudaErrors(cuMemAlloc(&score, number * sizeof(double)));						/* P-Value as output */

	/* H to D copy */
	checkCudaErrors(cuMemcpyHtoD(d_seq, seq_1D, sum * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_offset, offset, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_len, seq_len, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_len_6r, iLen, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(mat_v, hmm->vit_vec, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__)));
	checkCudaErrors(cuMemcpyHtoD(trans, hmm->trans_vec, hmm->vitQ * TRANS_TYPE * sizeof(__32int__)));

	sdkStopTimer(&timer);
	printf("Alloc & H to D Copy time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/********************************************************/
	/* 3. Runtime compilation, Generate PTX and Load module */
	/********************************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	/* NVRTC create handle */
	nvrtcProgram prog;
	NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog,				// prog
															 GPU_kernel,		// buffer
															 NULL,				// name: may be NULL, NVRTC then uses "default_program"
															 0,					// numHeaders (header paths are passed with -I below)
															 NULL,				// headers' content
															 NULL));			// include full name of headers

	/* Dynamic Options */
	const int numOptions = 8;
	char **test_char = new char*[numOptions];

	test_char[0] = new char[__INCLUDE__.length() + strlen("simd_def.h") + 1];					// #include simd_def.h
	strcpy(test_char[0], get_option(__INCLUDE__, "simd_def.h").c_str());

	test_char[1] = new char[__INCLUDE__.length() + strlen("simd_functions.h") + 1];				// #include simd_functions.h
	strcpy(test_char[1], get_option(__INCLUDE__, "simd_functions.h").c_str());

	test_char[2] = new char[__RDC__.length() + __F__.length() + 1];								// -rdc=false
	strcpy(test_char[2], get_option(__RDC__, __F__).c_str());

	test_char[3] = new char[__ARCH__.length() + __CC35__.length() + 1];							// -arch=compute_35
	strcpy(test_char[3], get_option(__ARCH__, __CC35__).c_str());

	test_char[4] = new char[__MAXREG__.length() + int2str(maxreg).length() + 1];				// -maxrregcount = <?>
	strcpy(test_char[4], get_option(__MAXREG__, int2str(maxreg)).c_str());

	test_char[5] = new char[__RIB__.length() + int2str(warp).length() + 1];						// #define RIB <?> : warps per block
	strcpy(test_char[5], get_option(__RIB__, int2str(warp)).c_str());

	test_char[6] = new char[__SIZE__.length() + int2str((int)force_local_size).length() + 1];	// #define SIZE 40
	strcpy(test_char[6], get_option(__SIZE__, int2str((int)force_local_size)).c_str());

	test_char[7] = new char[__Q__.length() + int2str(hmm->vitQ).length() + 1];					// #define Q <?>
	strcpy(test_char[7], get_option(__Q__, int2str(hmm->vitQ)).c_str());

	/* NVRTC takes the options as pointers to const char; char** does not
	 * convert to const char** implicitly, so add the qualifier explicitly. */
	const char **opts = const_cast<const char **>(test_char);

	/* NVRTC compile */
	NVRTC_SAFE_CALL("nvrtcCompileProgram", nvrtcCompileProgram(prog,		// prog
															   numOptions,	// numOptions
															   opts));		// options

	sdkStopTimer(&timer);
	printf("nvrtc Creat and Compile: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	//======================================================================================//
	// /* dump log */																		//
	// size_t logSize;																		//
	// NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize));	//
	// char *log = (char *) malloc(sizeof(char) * logSize + 1);								//
	// NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));				//
	// log[logSize] = '\x0';																//
	// std::cerr << "\n compilation log ---\n";												//
	// std::cerr << log;																	//
	// std::cerr << "\n end log ---\n";														//
	// free(log);																			//
	//======================================================================================//

	/* NVRTC fetch PTX */
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	size_t ptxsize;
	NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxsize));
	char *ptx = new char[ptxsize];
	NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));
	NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));	// destroy program instance

	/* Launch PTX by driver API */
	checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
	checkCudaErrors(cuModuleGetFunction(&kernel, module, "KERNEL"));	// return the handle of function, name is the same as real kernel function

	sdkStopTimer(&timer);
	printf("Compile & Load time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/**************************************/
	/* 4. GPU kernel launch by driver API */
	/**************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	/* cache preference only; the return status is deliberately not checked */
	cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1);

	/* parameters for kernel function: one address per kernel argument, in
	 * the order of the kernel's parameter list */
	void *arr[] = { &d_seq, &number, &d_offset,
					&score, &d_len, &d_len_6r, &mat_v, &trans,
					&(hmm->base_vs), &(hmm->E_lm), &(hmm->ddbound_vs),
					&(hmm->scale_w), &(hmm->vitQ), &(hmm->MU[1]), &(hmm->LAMBDA[1])};

	/* launch kernel */
	checkCudaErrors(cuLaunchKernel(	kernel,
									GRID.x, GRID.y, GRID.z,		/* grid dim */
									BLOCK.x, BLOCK.y, BLOCK.z,	/* block dim */
									0,0,						/* SMEM, stream */
									&arr[0],					/* kernel params */
									0));						/* extra opts */

	/* wait for kernel finish */
	checkCudaErrors(cuCtxSynchronize());			/* block for a context's task to complete */

	sdkStopTimer(&timer);
	printf("Kernel time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/*****************************************/
	/* 5. P-value return and post-processing */
	/*****************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	checkCudaErrors(cuMemcpyDtoH(pVal, score, number * sizeof(double)));

	sdkStopTimer(&timer);
	printf("D to H copy time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/* count the number of seqs pass */
	unsigned long pass_vit = 0;			/* # of seqs pass vit */

	for (unsigned int i = 0; i < number; i++)
	{
		if (pVal[i] <= F2)
			pass_vit++;
	}

	printf("|			PASS VIT 			\n");
	printf("|	 ALL	|	 FWD	|\n");
	/* pass_vit is unsigned long, so it must be printed with %lu */
	printf("|	%lu  	|	%lu  	|\n",  pass_vit, pass_vit);

	/*************************************/
	/* 6. release resources and context  */
	/*************************************/
	/* host-side buffers */
	for (int i = 0; i < numOptions; i++)
		delete[] test_char[i];
	delete[] test_char;
	delete[] ptx;

	/* device memory and module */
	checkCudaErrors(cuMemFree(d_seq));
	checkCudaErrors(cuMemFree(d_offset));
	checkCudaErrors(cuMemFree(d_len));
	checkCudaErrors(cuMemFree(d_len_6r));
	checkCudaErrors(cuMemFree(mat_v));
	checkCudaErrors(cuMemFree(trans));
	checkCudaErrors(cuMemFree(score));
	checkCudaErrors(cuModuleUnload(module));

	checkCudaErrors(cuDevicePrimaryCtxReset(cuDevice));		/* reset */
	checkCudaErrors(cuCtxSynchronize());					/* block for a context's task to complete */

	/* the context created in step 0 is owned by this function */
	checkCudaErrors(cuCtxDestroy(context));
}