C++ (Cpp) cuLaunchKernel示例

示例#1

0

显示文件

文件： microbench.cpp 项目： EgoIncarnate/maxas

// Host driver for the "microbench" kernel loaded from microbench.cubin.
//
// Optional positional arguments:
//   argv[1]  timing mode: "i" = internal (CS2R) timing, anything else =
//            external (cuEventElapsedTime) timing. Defaults to internal.
//   argv[2]  number of blocks (values below 1 become 1).
//   argv[3]  threads per block; values outside 32..1024 become 128, then the
//            count is rounded down to a multiple of 32.
//   argv[4]  internal timing: lanes to print per warp (1..32, default 1);
//            external timing: floating point operations per launch, used
//            only to compute the GFLOPS figure.
//   argv[5]  number of warm-up launches before the timed one (1..1000).
//
// Returns 0 on success; exits with EXIT_FAILURE when no suitable device is
// found or the host buffers cannot be allocated.
int main(int argc, char* argv[])
{
	//int iTest = 2896;
	//while (iTest < 0x7fff)
	//{
	//	int iResult = iTest * iTest;
	//	float fTest = (float)iTest;
	//	int fResult = (int)(fTest * fTest);

	//	printf("i*i:%08x f*f:%08x\n", iResult, fResult);

	//	iTest += 0x0800;
	//}
	//exit(0);

	char deviceName[32];
	int devCount, ordinal, major, minor;
	CUdevice  hDevice;

	// Initialize the Driver API and find a device
	CUDA_CHECK( cuInit(0) );
	CUDA_CHECK( cuDeviceGetCount(&devCount) );
	for (ordinal = 0; ordinal < devCount; ordinal++)
	{
		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
		// Compare (major, minor) as an ordered pair. Testing the two fields
		// independently would wrongly reject e.g. 6.0 or 6.1.
		if (major > 5 || (major == 5 && minor >= 2))
		{
			printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
			break;
		}
	}
	if (ordinal == devCount)
	{
		printf("No compute 5.2 or newer device found, exiting.\n");
		exit(EXIT_FAILURE);
	}

	// First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
	int internalTiming = 1;
	if (argc > 1)
		internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;

	// Second command line arg is the number of blocks
	int blocks = 1;
	if (argc > 2)
		blocks = atoi(argv[2]);
	if (blocks < 1)
		blocks = 1;

	// Third command line arg is the number of threads
	int threads = 128;
	if (argc > 3)
		threads = atoi(argv[3]);
	if (threads > 1024 || threads < 32)
		threads = 128;
	threads &= -32; // round down to a whole number of 32-thread warps

	// Fourth command line arg:
	double fops = 1.0;
	int lanes = 1;
	if (argc > 4)
	{
		if (internalTiming)
		{
			// The number of lanes to print for each warp
			lanes = atoi(argv[4]);
			if (lanes > 32 || lanes < 1)
				lanes = 1;
		}
		else
			// The number of floating point operations in a full kernel launch
			fops = atof(argv[4]);
	}

	// Fifth command line arg is the number of warm-up launches
	int repeat = 1;
	if (argc > 5)
		repeat = atoi(argv[5]);
	if (repeat > 1000 || repeat < 1)
		repeat = 1;

	// One int per thread across the whole grid
	size_t size = sizeof(int) * threads * blocks;

	// Setup our input and output buffers
	int* dataIn  = (int*)malloc(size);
	int* dataOut = (int*)malloc(size);
	int* clocks  = (int*)malloc(size);
	if (dataIn == NULL || dataOut == NULL || clocks == NULL)
	{
		printf("Failed to allocate host buffers, exiting.\n");
		free(dataIn); free(dataOut); free(clocks);
		exit(EXIT_FAILURE);
	}
	memset(dataIn, 0, size);

	CUmodule hModule;
	CUfunction hKernel;
	CUevent hStart, hStop;
	CUdeviceptr devIn, devOut, devClocks;

	// Init our context and device memory buffers
	// (hContext is declared outside this function.)
	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
	CUDA_CHECK( cuMemAlloc(&devIn, size) );
	CUDA_CHECK( cuMemAlloc(&devOut, size) );
	CUDA_CHECK( cuMemAlloc(&devClocks, size) );
	CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
	CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
	CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );

	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );

	// Load our kernel
	CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );

	// Setup the params
	void* params[] = { &devOut, &devClocks, &devIn };
	float ms = 0;

	// Warm up the clock (unless under nsight)
	if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
		for (int i = 0; i < repeat; i++)
			CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );

	// Launch the kernel, bracketed by events for the external timing
	CUDA_CHECK( cuEventRecord(hStart, NULL) );
	//CUDA_CHECK( cuProfilerStart() );
	CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
	//CUDA_CHECK( cuProfilerStop() );
	CUDA_CHECK( cuEventRecord(hStop, NULL) );
	CUDA_CHECK( cuEventSynchronize(hStop) );
	CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );

	//CUDA_CHECK( cuCtxSynchronize() );

	// Get back our results from each kernel
	CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
	CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );

	// Cleanup and shutdown of cuda
	CUDA_CHECK( cuEventDestroy(hStart) );
	CUDA_CHECK( cuEventDestroy(hStop) );
	CUDA_CHECK( cuModuleUnload(hModule) );
	CUDA_CHECK( cuMemFree(devIn) );
	CUDA_CHECK( cuMemFree(devOut) );
	CUDA_CHECK( cuMemFree(devClocks) );
	CUDA_CHECK( cuCtxDestroy(hContext) );
	hContext = 0;

	// With internal timing, print the per-warp clock data
	if (internalTiming)
	{
		// min starts at the largest int so the first sample always replaces
		// it, however large the clock counts are.
		int count = 0, total = 0, min = 0x7fffffff, max = 0;

		int* clocks_p  = clocks;
		int* dataOut_p = dataOut;

		// Loop over and print results
		for (int blk = 0; blk < blocks; blk++)
		{
			for (int tid = 0; tid < threads; tid += 32)
			{
				// Sometimes we want data on each thread, sometimes just one sample per warp is fine
				for (int lane = 0; lane < lanes; lane++)
					printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u

				// Statistics use only the first lane of each warp
				count++;
				total += clocks_p[tid];
				if (clocks_p[tid] < min) min = clocks_p[tid];
				if (clocks_p[tid] > max) max = clocks_p[tid];
			}
			clocks_p  += threads;
			dataOut_p += threads;
		}
		printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max);
	}
	else
	{
		// With external timing we're testing throughput and report the event-measured time
		printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
	}
	// And free up host memory
	free(dataIn); free(dataOut); free(clocks);

	return 0;
}

示例#2

0

显示文件

文件： GPUInterfaceCUDA.cpp 项目： beagle-dev/beagle-lib

/**
 * Launches deviceFunction with a variadic argument list on one of the
 * object's CUDA streams.
 *
 * The variadic arguments are read as parameterCountV values of type GPUPtr
 * followed by (totalParameterCount - parameterCountV) unsigned ints, and are
 * passed to the kernel in that order.
 *
 * When streamIndex >= 0 the launch goes to cudaStreams[streamIndex % numStreams]
 * and an event with the same index is recorded after it; if waitIndex >= 0 as
 * well, that stream first waits on cudaEvents[waitIndex % numStreams].
 * When streamIndex < 0 the launch goes to cudaStreams[0] with no event
 * handling.
 */
void GPUInterface::LaunchKernelConcurrent(GPUFunction deviceFunction,
                                         Dim3Int block,
                                         Dim3Int grid,
                                         int streamIndex,
                                         int waitIndex,
                                         int parameterCountV,
                                         int totalParameterCount,
                                         ...) { // parameters
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernelConcurrent\n");
#endif

    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    // Backing storage for the argument values plus the pointer table that
    // cuLaunchKernel consumes.
    void** kernelArgs = (void**)malloc(sizeof(void*) * totalParameterCount);
    GPUPtr* ptrValues = (GPUPtr*)malloc(sizeof(GPUPtr) * totalParameterCount);
    unsigned int* intValues = (unsigned int*)malloc(sizeof(unsigned int) * totalParameterCount);

    va_list varArgs;
    va_start(varArgs, totalParameterCount);

    // Leading arguments: GPUPtr values.
    int slot = 0;
    while (slot < parameterCountV) {
        ptrValues[slot] = (GPUPtr)(size_t)va_arg(varArgs, GPUPtr);
        kernelArgs[slot] = (void*)(ptrValues + slot);
        ++slot;
    }

    // Trailing arguments: unsigned ints, stored from index 0 of intValues.
    slot = parameterCountV;
    while (slot < totalParameterCount) {
        unsigned int* dest = intValues + (slot - parameterCountV);
        *dest = va_arg(varArgs, unsigned int);
        kernelArgs[slot] = (void*)dest;
        ++slot;
    }

    va_end(varArgs);

    if (streamIndex < 0) {
        SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z,
                                 block.x, block.y, block.z, 0,
                                 cudaStreams[0], kernelArgs, NULL));
    } else {
        const int launchSlot = streamIndex % numStreams;

        if (waitIndex >= 0) {
            const int waitSlot = waitIndex % numStreams;
            SAFE_CUDA(cuStreamWaitEvent(cudaStreams[launchSlot], cudaEvents[waitSlot], 0));
        }

        SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z,
                                 block.x, block.y, block.z, 0,
                                 cudaStreams[launchSlot], kernelArgs, NULL));

        // Mark completion of this launch so later calls can wait on it.
        SAFE_CUDA(cuEventRecord(cudaEvents[launchSlot], cudaStreams[launchSlot]));
    }

    free(kernelArgs);
    free(ptrValues);
    free(intValues);

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::LaunchKernelConcurrent\n");
#endif

}

示例#3

0

显示文件

文件： binomialOptions_gpu.cpp 项目： ajperalt/nvidia-cuda-7.0-samples

// Prices optN options on the GPU with the "binomialOptionsKernel" kernel.
//
// On the first call the kernel source is located, compiled to PTX and loaded
// into the file-level module handle. Each call then derives the per-option
// constants on the host, copies them into the module global "d_OptionData",
// launches one 128-thread block per option, waits for completion, and copies
// optN results from the module global "d_CallValue" into callValue.
//
// NOTE(review): optN is not checked against MAX_OPTIONS, the size of the
// host staging array - confirm callers guarantee optN <= MAX_OPTIONS.
extern "C" void binomialOptionsGPU(
    real *callValue,
    TOptionData  *optionData,
    int optN,
    int argc,
    char **argv
)
{
    // Lazy one-time compile and load of the kernel module.
    if (!moduleLoaded) {
      kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]);
      compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize);
      module = loadPTX(ptx, argc, argv);
      moduleLoaded = true;
    }

    __TOptionData h_OptionData[MAX_OPTIONS];

    for (int opt = 0; opt < optN; ++opt)
    {
        const TOptionData &src = optionData[opt];
        __TOptionData     &dst = h_OptionData[opt];

        const real optT = src.T;
        const real optR = src.R;
        const real optV = src.V;

        const real stepDt   = optT / (real)NUM_STEPS;
        const real vSqrtDt  = optV * sqrt(stepDt);
        const real rTimesDt = optR * stepDt;

        // Per-step interest and discount factors
        const real interest = exp(rTimesDt);
        const real discount = exp(-rTimesDt);

        // Values and pseudoprobabilities of upward and downward moves
        const real upMove   = exp(vSqrtDt);
        const real downMove = exp(-vSqrtDt);
        const real probUp   = (interest - downMove) / (upMove - downMove);
        const real probDown = (real)1.0 - probUp;

        dst.S      = (real)src.S;
        dst.X      = (real)src.X;
        dst.vDt    = (real)vSqrtDt;
        dst.puByDf = (real)(probUp * discount);
        dst.pdByDf = (real)(probDown * discount);
    }

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel"));

    // Upload the per-option constants into the module's global array.
    CUdeviceptr d_OptionData;
    checkCudaErrors(cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData"));
    checkCudaErrors(cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData)));

    // One block per option, 128 threads per block.
    const unsigned int blockX = 128;
    const unsigned int gridX  = optN;

    // The kernel takes no launch arguments; it works on the module globals.
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   gridX, 1, 1,   /* grid dim */
                                   blockX, 1, 1,  /* block dim */
                                   0, 0,          /* shared mem, stream */
                                   NULL,          /* arguments */
                                   0));

    checkCudaErrors(cuCtxSynchronize());

    // Fetch the results from the module's output global.
    CUdeviceptr d_CallValue;
    checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue"));
    checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN *sizeof(real)));
}

示例#4

0

显示文件

文件： RTC_VIT.cpp 项目： Super-Hippo/CUDAMPF

/*
 * Compiles the CUDA source string GPU_kernel with NVRTC, loads the resulting
 * PTX through the driver API, runs its "KERNEL" entry point once and copies
 * the per-sequence results back to the host.
 *
 *   number     number of sequences (element count of offset, seq_len, iLen, pVal)
 *   GPU_kernel CUDA source text handed to nvrtcCreateProgram
 *   hmm        profile supplying vitQ, the striped score tables (vit_vec,
 *              trans_vec) and the scalar kernel arguments
 *   seq_1D     1D database of `sum` unsigned ints
 *   offset     offset of each sequence
 *   seq_len    raw length of each sequence
 *   iLen       padded length of each sequence
 *   sum        element count of seq_1D
 *   pVal       host output: one double per sequence
 *   warp       value for the RIB define (warps per block)
 *   maxreg     value for the -maxrregcount option
 *   GRID/BLOCK launch dimensions
 *
 * Prints timing for each phase and the number of entries with pVal <= F2.
 * Every driver/NVRTC failure is handled by checkCudaErrors / NVRTC_SAFE_CALL.
 * All device buffers, the module and the context created here are released
 * before returning.
 */
void RTC_VIT(unsigned int number, const char* GPU_kernel, HMMER_PROFILE *hmm,
	     unsigned int *seq_1D, unsigned int *offset, unsigned int *seq_len,
	     unsigned int *iLen, unsigned int sum, double *pVal,
	     int warp, int maxreg, dim3 GRID, dim3 BLOCK)
{
	/**********************************/
	/* 0. Prepare for cuda driver API */
	/**********************************/
	CUdevice cuDevice;
	CUcontext context;
	CUmodule module;
	CUfunction kernel;

	checkCudaErrors(cuInit(0));
	checkCudaErrors(cuDeviceGet(&cuDevice, 0));
	checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));

	/*********************************************/
	/* 1. Device Property: fixed based on Device */
	/*********************************************/

	/****************************************/
	/* 2. Device Memory Allocation and copy */
	/****************************************/
	StopWatchInterface *timer;
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	/* Driver API pointers */
	CUdeviceptr d_seq, d_offset, d_len, d_len_6r, mat_v, trans, score;

	/* Allocation */
	checkCudaErrors(cuMemAlloc(&d_seq, sum * sizeof(unsigned int)));					/* copy 1D database */
	checkCudaErrors(cuMemAlloc(&d_offset, number * sizeof(unsigned int)));				/* copy offset of each seq */
	checkCudaErrors(cuMemAlloc(&d_len, number * sizeof(unsigned int)));					/* copy raw length of each seq */
	checkCudaErrors(cuMemAlloc(&d_len_6r, number * sizeof(unsigned int)));				/* copy padding length of each seq */
	checkCudaErrors(cuMemAlloc(&mat_v, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__)));	/* striped EMISSION score */
	checkCudaErrors(cuMemAlloc(&trans, hmm->vitQ * TRANS_TYPE * sizeof(__32int__)));	/* striped transition score */
	checkCudaErrors(cuMemAlloc(&score, number * sizeof(double)));						/* P-Value as output */

	/* H to D copy */
	checkCudaErrors(cuMemcpyHtoD(d_seq, seq_1D, sum * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_offset, offset, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_len, seq_len, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(d_len_6r, iLen, number * sizeof(unsigned int)));
	checkCudaErrors(cuMemcpyHtoD(mat_v, hmm->vit_vec, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__)));
	checkCudaErrors(cuMemcpyHtoD(trans, hmm->trans_vec, hmm->vitQ * TRANS_TYPE * sizeof(__32int__)));

	sdkStopTimer(&timer);
	printf("Alloc & H to D Copy time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/********************************************************/
	/* 3. Runtime compilation, Generate PTX and Load module */
	/********************************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	/* NVRTC create handle */
	nvrtcProgram prog;
	NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog,			// prog
															 GPU_kernel,	// buffer
															 NULL,			// name: may be NULL, NVRTC then uses "default_program"
															 0,				// numHeaders (header path is passed with -I below)
															 NULL,			// headers' content
															 NULL));		// include full name of headers

	/* Dynamic options.
	 * The strings own their storage, so the buffers can neither be undersized
	 * nor leaked; opts[] only borrows their c_str() pointers and must not
	 * outlive option_strings. */
	const std::string option_strings[] = {
		get_option(__INCLUDE__, "simd_def.h"),						// #include simd_def.h
		get_option(__INCLUDE__, "simd_functions.h"),				// #include simd_functions.h
		get_option(__RDC__, __F__),									// -rdc=false
		get_option(__ARCH__, __CC35__),								// -arch=compute_35
		get_option(__MAXREG__, int2str(maxreg)),					// -maxrregcount = <?>
		get_option(__RIB__, int2str(warp)),							// #define RIB <?> : warps per block
		get_option(__SIZE__, int2str((int)force_local_size)),		// #define SIZE 40
		get_option(__Q__, int2str(hmm->vitQ))						// #define Q <?>
	};
	const int num_options = (int)(sizeof(option_strings) / sizeof(option_strings[0]));
	const char *opts[sizeof(option_strings) / sizeof(option_strings[0])];
	for (int i = 0; i < num_options; i++)
		opts[i] = option_strings[i].c_str();

	/* NVRTC compile */
	NVRTC_SAFE_CALL("nvrtcCompileProgram", nvrtcCompileProgram(prog,			// prog
															   num_options,		// numOptions
															   opts));			// options

	sdkStopTimer(&timer);
	printf("nvrtc Creat and Compile: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	//======================================================================================//
	// /* dump log */																		//
	// size_t logSize;																		//
	// NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize));	//
	// char *log = (char *) malloc(sizeof(char) * logSize + 1);								//
	// NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));				//
	// log[logSize] = '\x0';																//
	// std::cerr << "\n compilation log ---\n";												//
	// std::cerr << log;																	//
	// std::cerr << "\n end log ---\n";														//
	// free(log);																			//
	//======================================================================================//

	/* NVRTC fetch PTX */
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	size_t ptxsize;
	NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxsize));
	char *ptx = new char[ptxsize];
	NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));
	NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));	// destroy program instance

	/* Launch PTX by driver API */
	checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
	checkCudaErrors(cuModuleGetFunction(&kernel, module, "KERNEL"));	// handle of the kernel; name must match the entry point in the source

	sdkStopTimer(&timer);
	printf("Compile & Load time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/**************************************/
	/* 4. GPU kernel launch by driver API */
	/**************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	checkCudaErrors(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1));

	/* parameters for kernel function, in the order the kernel declares them */
	void *arr[] = { &d_seq, &number, &d_offset,
					&score, &d_len, &d_len_6r, &mat_v, &trans,
					&(hmm->base_vs), &(hmm->E_lm), &(hmm->ddbound_vs),
					&(hmm->scale_w), &(hmm->vitQ), &(hmm->MU[1]), &(hmm->LAMBDA[1])};

	/* launch kernel */
	checkCudaErrors(cuLaunchKernel(kernel,
								   GRID.x, GRID.y, GRID.z,		/* grid dim */
								   BLOCK.x, BLOCK.y, BLOCK.z,	/* block dim */
								   0, 0,						/* SMEM, stream */
								   &arr[0],						/* kernel params */
								   0));							/* extra opts */

	/* wait for kernel finish */
	checkCudaErrors(cuCtxSynchronize());			/* block for a context's task to complete */

	sdkStopTimer(&timer);
	printf("Kernel time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/*****************************************/
	/* 5. P-value return and post-processing */
	/*****************************************/
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);

	checkCudaErrors(cuMemcpyDtoH(pVal, score, number * sizeof(double)));

	sdkStopTimer(&timer);
	printf("D to H copy time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

	/* count the number of seqs pass */
	unsigned long pass_vit = 0;			/* # of seqs pass vit */

	for (unsigned int i = 0; i < number; i++)
	{
		if (pVal[i] <= F2)
			pass_vit++;
	}

	printf("|			PASS VIT 			\n");
	printf("|	 ALL	|	 FWD	|\n");
	printf("|	%lu  	|	%lu  	|\n",  pass_vit, pass_vit);

	/************************/
	/* 6. clean the context */
	/************************/
	checkCudaErrors(cuCtxSynchronize());					/* make sure nothing is still in flight */

	checkCudaErrors(cuMemFree(d_seq));
	checkCudaErrors(cuMemFree(d_offset));
	checkCudaErrors(cuMemFree(d_len));
	checkCudaErrors(cuMemFree(d_len_6r));
	checkCudaErrors(cuMemFree(mat_v));
	checkCudaErrors(cuMemFree(trans));
	checkCudaErrors(cuMemFree(score));

	checkCudaErrors(cuModuleUnload(module));
	delete[] ptx;											/* released only after the module is gone */

	checkCudaErrors(cuCtxDestroy(context));					/* the context created in step 0 */
	checkCudaErrors(cuDevicePrimaryCtxReset(cuDevice));		/* reset, as the original code did */
}

示例#5

0

显示文件

文件： clock.cpp 项目： zhanglx13/Automated_particle_filter_design_system

/////////////////////////////////////////////////////
// Main program
/////////////////////////////////////////////////////
// Launches the "timeDummy" kernel from clock.ptx with NUM_BLOCKS blocks of
// NUM_THREADS threads, reads back two clock values per warp and prints the
// span between the smallest value in the first half of the buffer and the
// largest value in the second half.
int main(int argc, char **argv)
{
  typedef long clock_t;

  unsigned int num_warps = NUM_BLOCKS * NUM_THREADS / 32;

  // Two timer slots per warp: first half of the buffer, then second half
  const size_t timerBytes = num_warps * sizeof(clock_t) * 2;
  clock_t *timer = (clock_t*)malloc(timerBytes);

  // Bring up the driver API
  checkCudaErrors(cuInit(0));

  // Bail out when no CUDA device is present
  int deviceCount = 0;
  checkCudaErrors(cuDeviceGetCount(&deviceCount));
  if (deviceCount == 0) {
    printf("There is no device supporting CUDA.\n");
    exit (0);
  }

  // Device 0 and a fresh context on it
  CUdevice cuDevice;
  CUcontext cuContext;
  checkCudaErrors(cuDeviceGet(&cuDevice, 0));
  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // JIT compile the kernel from PTX and fetch its handle
  CUmodule cuModule;
  CUfunction kernel_addr;
  ptxJIT(&cuModule, &kernel_addr, "clock.ptx", "timeDummy");

  // Device-side timer buffer, passed as the kernel's only argument
  CUdeviceptr dtimer;
  checkCudaErrors(cuMemAlloc(&dtimer, timerBytes));
  void *kernel_param[] = {(void*)&dtimer};

  const unsigned int gridX  = NUM_BLOCKS;
  const unsigned int blockX = NUM_THREADS;
  checkCudaErrors(cuLaunchKernel(kernel_addr,
                                 gridX, 1, 1,
                                 blockX, 1, 1,
                                 0, 0, kernel_param, 0));

  // Wait for the kernel, then pull the results back to the host
  checkCudaErrors(cuCtxSynchronize());
  checkCudaErrors(cuMemcpyDtoH(timer, dtimer, timerBytes));

  // Earliest value in the first half, latest value in the second half
  const clock_t *firstHalf  = timer;
  const clock_t *secondHalf = timer + num_warps;
  clock_t minStart = firstHalf[0];
  clock_t maxEnd   = secondHalf[0];
  for (unsigned int w = 1; w < num_warps; ++w) {
    if (firstHalf[w] < minStart)
      minStart = firstHalf[w];
    if (secondHalf[w] > maxEnd)
      maxEnd = secondHalf[w];
  }

  printf("Total clocks = %Lf\n", (long double)(maxEnd - minStart));
  printf("Number of warps = %u\n", num_warps);

  // Clean up
  free(timer);
  checkCudaErrors(cuMemFree(dtimer));
  checkCudaErrors(cuModuleUnload(cuModule));
  checkCudaErrors(cuCtxDestroy(cuContext));

  return EXIT_SUCCESS;
}

示例#6

0

显示文件

文件： lib-73.c 项目： pjump/gcc

/* Test program: registers N user-created CUDA streams with the OpenACC
   runtime, launches the "delay" kernel from subr.ptx once on each stream,
   and checks that acc_async_test_all reports the work as pending right
   after the launches and as complete after sleeping.  Any unexpected
   result aborts.  */
int
main (int argc, char **argv)
{
    CUdevice dev;
    CUfunction delay;
    CUmodule module;
    CUresult r;
    const int N = 10;
    int i;
    CUstream streams[N];
    unsigned long *a, *d_a, dticks;
    int nbytes;
    float dtime;
    void *kargs[2];
    int clkrate;
    int devnum, nprocs;

    acc_init (acc_device_nvidia);

    devnum = acc_get_device_num (acc_device_nvidia);

    r = cuDeviceGet (&dev, devnum);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuDeviceGet failed: %d\n", r);
        abort ();
    }

    r =
        cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                              dev);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
        abort ();
    }

    r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
        abort ();
    }

    r = cuModuleLoad (&module, "subr.ptx");
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuModuleLoad failed: %d\n", r);
        abort ();
    }

    r = cuModuleGetFunction (&delay, module, "delay");
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
        abort ();
    }

    nbytes = nprocs * sizeof (unsigned long);

    dtime = 200.0;

    /* Kernel argument derived from the requested delay and the device
       clock rate attribute.  */
    dticks = (unsigned long) (dtime * clkrate);

    a = (unsigned long *) malloc (nbytes);
    d_a = (unsigned long *) acc_malloc (nbytes);

    acc_map_data (a, d_a, nbytes);

    kargs[0] = (void *) &d_a;
    kargs[1] = (void *) &dticks;

    for (i = 0; i < N; i++)
    {
        /* No stream may be associated with this async id yet.  */
        streams[i] = (CUstream) acc_get_cuda_stream (i);
        if (streams[i] != NULL)
            abort ();

        r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
        if (r != CUDA_SUCCESS)
        {
            fprintf (stderr, "cuStreamCreate failed: %d\n", r);
            abort ();
        }

        if (!acc_set_cuda_stream (i, streams[i]))
            abort ();
    }

    for (i = 0; i < N; i++)
    {
        r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
        if (r != CUDA_SUCCESS)
        {
            fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
            abort ();
        }

    }

    /* Right after the launches the work must still be pending.  */
    if (acc_async_test_all () != 0)
    {
        fprintf (stderr, "asynchronous operation not running\n");
        abort ();
    }

    sleep ((int) (dtime / 1000.0f) + 1);

    /* After the sleep everything must have completed.  */
    if (acc_async_test_all () != 1)
    {
        fprintf (stderr, "asynchronous operation still running\n");
        abort ();
    }

    acc_unmap_data (a);

    free (a);
    acc_free (d_a);

    acc_shutdown (acc_device_nvidia);

    exit (0);
}

示例#7

0

显示文件

文件： lud_cuda.c 项目： Constellation/gdev-bench

/*
 * Launches the three LUD kernels (diagonal, perimeter, internal) from `mod`
 * on the matrix at device address `m`, stepping the offset `i` by BLOCK_SIZE,
 * and finishes with one diagonal launch at the final offset.
 *
 *   mod         module containing the three kernels
 *   m           device pointer passed as the first kernel argument
 *   matrix_dim  passed as the second kernel argument; also bounds the loop
 *
 * Returns 0 on every path visible here, including failures, which are only
 * reported on stdout.
 * NOTE(review): callers cannot tell failure from success by the return
 * value - confirm this is intended.
 */
int lud_launch(CUmodule mod, CUdeviceptr m, int matrix_dim)
{
	int i = 0;
	int bdx, bdy, gdx, gdy;
	int shared_size;
	CUfunction f_diagonal, f_perimeter, f_internal;
	CUresult res;
	/* All launches share one argument list; the kernels see the value of
	 * `i` at the time of each cuLaunchKernel call. */
	void* param[] = {(void*) &m, (void*) &matrix_dim, (void*) &i};

	/* get functions. */
	res = cuModuleGetFunction(&f_diagonal, mod, "_Z12lud_diagonalPfii");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction(f_diagonal) failed\n");
		return 0;
	}
	res = cuModuleGetFunction(&f_perimeter, mod, "_Z13lud_perimeterPfii");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction(f_perimeter) failed\n");
		return 0;
	}
	res = cuModuleGetFunction(&f_internal, mod, "_Z12lud_internalPfii");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction(f_internal) failed\n");
		return 0;
	}

	for (i = 0; i < matrix_dim - BLOCK_SIZE; i += BLOCK_SIZE) {
		/* diagonal */
		gdx = 1;
		gdy = 1;
		bdx = BLOCK_SIZE;
		bdy = 1;
		shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
		res = cuLaunchKernel(f_diagonal, gdx, gdy, 1, bdx, bdy, 1, shared_size,
							 0, (void**) param, NULL);
		if (res != CUDA_SUCCESS) {
			printf("cuLaunchKernel(f_diagonal) failed: res = %u\n", res);
			return 0;
		}

		/* perimeter */
		gdx = (matrix_dim - i) / BLOCK_SIZE - 1;
		gdy = 1;
		bdx = BLOCK_SIZE * 2;
		bdy = 1;
		shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float) * 3;
		res = cuLaunchKernel(f_perimeter, gdx, gdy, 1, bdx, bdy, 1, shared_size,
							 0, (void**) param, NULL);
		if (res != CUDA_SUCCESS) {
			printf("cuLaunchKernel(f_perimeter) failed: res = %u\n", res);
			return 0;
		}

		/* internal */
		gdx = (matrix_dim - i) / BLOCK_SIZE - 1;
		gdy = (matrix_dim - i) / BLOCK_SIZE - 1;
		bdx = BLOCK_SIZE;
		bdy = BLOCK_SIZE;
		shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float) * 2;
		res = cuLaunchKernel(f_internal, gdx, gdy, 1, bdx, bdy, 1, shared_size,
							 0, (void**) param, NULL);
		if (res != CUDA_SUCCESS) {
			printf("cuLaunchKernel(internal) failed: res = %u\n", res);
			return 0;
		}
	}

	/* Final diagonal step. Set the full launch configuration explicitly:
	 * reusing the values left behind by the loop would launch with the
	 * "internal" block shape and shared size, and would read uninitialized
	 * variables when the loop body never ran. */
	gdx = 1;
	gdy = 1;
	bdx = BLOCK_SIZE;
	bdy = 1;
	shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
	res = cuLaunchKernel(f_diagonal, gdx, gdy, 1, bdx, bdy, 1, shared_size,
						 0, (void**) param, NULL);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(f_diagonal) failed: res = %u\n", res);
		return 0;
	}

	return 0;
}

示例#8

0

显示文件

文件： lib-82.c 项目： 0day-ci/gcc

/* Abort with "<api> failed: <code>" on stderr unless STATUS is
   CUDA_SUCCESS.  */
static void
expect_cuda_success (CUresult status, const char *api)
{
  if (status == CUDA_SUCCESS)
    return;

  fprintf (stderr, "%s failed: %d\n", api, status);
  abort ();
}

/* Test program: creates one CUDA stream per multiprocessor, registers each
   with the OpenACC runtime, launches the "delay2" kernel from subr.ptx on
   every stream, waits for all of them and checks that the value copied out
   for stream i equals i.  Any unexpected result aborts.  */
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay2;
  CUmodule module;
  int N;
  int k;
  CUstream *streams;
  unsigned long **a, **d_a, *tid, ticks;
  int nbytes;
  void *kargs[3];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  expect_cuda_success (cuDeviceGet (&dev, devnum), "cuDeviceGet");

  expect_cuda_success (cuDeviceGetAttribute
		       (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			dev),
		       "cuDeviceGetAttribute");

  expect_cuda_success (cuDeviceGetAttribute
		       (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev),
		       "cuDeviceGetAttribute");

  expect_cuda_success (cuModuleLoad (&module, "subr.ptx"), "cuModuleLoad");

  expect_cuda_success (cuModuleGetFunction (&delay2, module, "delay2"),
		       "cuModuleGetFunction");

  nbytes = sizeof (int);

  /* Argument for the first launch; later launches get a smaller value,
     see the launch loop below.  */
  ticks = (unsigned long) (200.0 * clkrate);

  N = nprocs;

  streams = (CUstream *) malloc (N * sizeof (void *));

  a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  tid = (unsigned long *) malloc (N * sizeof (unsigned long));

  for (k = 0; k < N; k++)
    {
      /* Host cell starts at N, a value no stream index can equal.  */
      a[k] = (unsigned long *) malloc (sizeof (unsigned long));
      *a[k] = N;
      d_a[k] = (unsigned long *) acc_malloc (nbytes);
      tid[k] = k;

      acc_map_data (a[k], d_a[k], nbytes);

      /* No stream may be associated with this async id yet.  */
      streams[k] = (CUstream) acc_get_cuda_stream (k);
      if (streams[k] != NULL)
	abort ();

      expect_cuda_success (cuStreamCreate (&streams[k], CU_STREAM_DEFAULT),
			   "cuStreamCreate");

      if (!acc_set_cuda_stream (k, streams[k]))
	abort ();
    }

  /* kargs[1] always points at `ticks'; only the other two slots change
     per iteration.  */
  kargs[1] = (void *) &ticks;
  for (k = 0; k < N; k++)
    {
      kargs[0] = (void *) &d_a[k];
      kargs[2] = (void *) &tid[k];

      expect_cuda_success (cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0,
					   streams[k], kargs, 0),
			   "cuLaunchKernel");

      /* Every launch after the first uses the shorter value.  */
      ticks = (unsigned long) (50.0 * clkrate);
    }

  acc_wait_all_async (0);

  for (k = 0; k < N; k++)
    {
      acc_copyout (a[k], nbytes);
      if (*a[k] != k)
	abort ();
    }

  free (streams);

  for (k = 0; k < N; k++)
    free (a[k]);

  free (a);
  free (d_a);
  free (tid);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}

示例#9

0

显示文件

文件： basic_shift.cpp 项目： sebsgit/gemu

// Loads the embedded PTX module, runs the entry point called `name` with a
// single thread, and returns the value that kernel stored through its first
// parameter.
//
//   name        entry-point name looked up in the module
//   input       second kernel argument, passed by address as type T
//   shiftValue  third kernel argument
//
// Every driver call is checked with cu_assert.
T run_function(const std::string& name, const T input, const int shiftValue) {
	const std::string ptxSource =
	"//\n"
	"// Generated by NVIDIA NVVM Compiler\n"
	"//\n"
	"// Compiler Build ID: CL-19856038\n"
	"// Cuda compilation tools, release 7.5, V7.5.17\n"
	"// Based on LLVM 3.4svn\n"
	"//\n"
	"\n"
	".version 4.3\n"
	".target sm_20\n"
	".address_size 64\n"
	"\n"
	"	// .globl	_Z10kernel_s32Piii\n"
	"\n"
	".visible .entry _Z10kernel_s32Piii(\n"
	"	.param .u64 _Z10kernel_s32Piii_param_0,\n"
	"	.param .u32 _Z10kernel_s32Piii_param_1,\n"
	"	.param .u32 _Z10kernel_s32Piii_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<4>;\n"
	"	.reg .b64 	%rd<3>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_s32Piii_param_0];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_s32Piii_param_1];\n"
	"	ld.param.u32 	%r2, [_Z10kernel_s32Piii_param_2];\n"
	"	cvta.to.global.u64 	%rd2, %rd1;\n"
	"	shr.s32 	%r3, %r1, %r2;\n"
	"	st.global.u32 	[%rd2], %r3;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_s64Pxxi\n"
	".visible .entry _Z10kernel_s64Pxxi(\n"
	"	.param .u64 _Z10kernel_s64Pxxi_param_0,\n"
	"	.param .u64 _Z10kernel_s64Pxxi_param_1,\n"
	"	.param .u32 _Z10kernel_s64Pxxi_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<2>;\n"
	"	.reg .b64 	%rd<5>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_s64Pxxi_param_0];\n"
	"	ld.param.u64 	%rd2, [_Z10kernel_s64Pxxi_param_1];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_s64Pxxi_param_2];\n"
	"	cvta.to.global.u64 	%rd3, %rd1;\n"
	"	shr.s64 	%rd4, %rd2, %r1;\n"
	"	st.global.u64 	[%rd3], %rd4;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_u32Pjji\n"
	".visible .entry _Z10kernel_u32Pjji(\n"
	"	.param .u64 _Z10kernel_u32Pjji_param_0,\n"
	"	.param .u32 _Z10kernel_u32Pjji_param_1,\n"
	"	.param .u32 _Z10kernel_u32Pjji_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<4>;\n"
	"	.reg .b64 	%rd<3>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_u32Pjji_param_0];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_u32Pjji_param_1];\n"
	"	ld.param.u32 	%r2, [_Z10kernel_u32Pjji_param_2];\n"
	"	cvta.to.global.u64 	%rd2, %rd1;\n"
	"	shr.u32 	%r3, %r1, %r2;\n"
	"	st.global.u32 	[%rd2], %r3;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_u64Pyyi\n"
	".visible .entry _Z10kernel_u64Pyyi(\n"
	"	.param .u64 _Z10kernel_u64Pyyi_param_0,\n"
	"	.param .u64 _Z10kernel_u64Pyyi_param_1,\n"
	"	.param .u32 _Z10kernel_u64Pyyi_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<2>;\n"
	"	.reg .b64 	%rd<5>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_u64Pyyi_param_0];\n"
	"	ld.param.u64 	%rd2, [_Z10kernel_u64Pyyi_param_1];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_u64Pyyi_param_2];\n"
	"	cvta.to.global.u64 	%rd3, %rd1;\n"
	"	shr.u64 	%rd4, %rd2, %r1;\n"
	"	st.global.u64 	[%rd3], %rd4;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"\n"
	;

	// Load the module and look up the requested entry point.
	CUmodule module = 0;
	cu_assert(cuModuleLoadData(&module, ptxSource.c_str()));
	CUfunction kernel = 0;
	cu_assert(cuModuleGetFunction(&kernel, module, name.c_str()));

	// Device cell the kernel writes its result into.
	CUdeviceptr devResult;
	T hostResult;
	cu_assert(cuMemAlloc(&devResult, sizeof(hostResult)));

	// Kernel arguments, in declaration order: output pointer, value, shift.
	void * kernelArgs[] = {&devResult, (void*)&input, (void*)&shiftValue};
	const auto launchStatus = cuLaunchKernel(kernel,
	                                         1, 1, 1,   // grid
	                                         1, 1, 1,   // block
	                                         0, 0,      // shared memory, stream
	                                         kernelArgs, nullptr);
	cu_assert(launchStatus);

	// Read the result back and release everything created above.
	cu_assert(cuMemcpyDtoH(&hostResult, devResult, sizeof(hostResult)));
	cu_assert(cuMemFree(devResult));
	cu_assert(cuModuleUnload(module));
	return hostResult;
}

示例#10

0

显示文件

文件： cudaProcessFrame.cpp 项目： ffmpeg-build-win/nvidia_video_sdk

/**
 * Launch the kernel referenced by fpFunc (by this function's name, an
 * NV12 -> ARGB conversion kernel) through the CUDA Driver API.
 *
 * Launch geometry: blocks of 32x16 threads. Each thread outputs two pixels
 * horizontally, so the grid width covers width / 2 columns (rounded up to a
 * whole block); the grid height covers `height` rows (rounded up).
 *
 * Kernel arguments are passed in this order: d_srcNV12, nSourcePitch,
 * d_dstARGB, nDestPitch, width, height.  They are forwarded unchanged; their
 * exact meaning (e.g. pitch units) is defined by the kernel, not here.
 *
 * @param d_srcNV12     device pointer to the source surface
 * @param nSourcePitch  pitch of the source surface
 * @param d_dstARGB     device pointer to the destination surface
 * @param nDestPitch    pitch of the destination surface
 * @param width         image width
 * @param height        image height
 * @param fpFunc        kernel function handle to launch
 * @param streamID      stream on which the launch is enqueued
 * @return the CUresult of the launch call; on failure a diagnostic is also
 *         written to stderr.  The launch is not synchronized here.
 */
CUresult  cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                  CUdeviceptr d_dstARGB, size_t nDestPitch,
                                  uint32 width,          uint32 height,
                                  CUfunction fpFunc, CUstream streamID)
{
    CUresult status;
    // Each thread will output 2 pixels at a time.  The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if __CUDA_API_VERSION >= 4000
    // CUDA 4.0+ path: kernel parameters are passed as an array of pointers to
    // the argument values, in kernel-signature order.
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height
                   };

    // No dynamic shared memory (0); no "extra" options (NULL).
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // Legacy Driver API launch method (CUDA 1.0 to 3.2): the parameter buffer
    // is filled manually, tracking the byte offset of every argument.
    checkCudaErrors(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // cuParamSetv() copies raw bytes, which also allows 64-bit device pointers.

    // device pointer for Source Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_srcNV12,    sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_dstARGB,    sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nDestPitch,   sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    checkCudaErrors(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    checkCudaErrors(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    checkCudaErrors(cuParamSetSize(fpFunc, offset));

    // Launch the kernel with the grid dimensions.  Assign to the `status`
    // declared at the top of the function: redeclaring it here would be a
    // redefinition error in this branch.
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        // Explicit casts keep the variadic arguments matched to %p / %d.
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", (void *)fpFunc, (int)status);
    }

    return status;
}