Ejemplo n.º 1
	// Zero a buffer in two places: memset() clears mem.memory_size() bytes at
	// mem.data_pointer, then cuMemsetD8() clears the same number of bytes at
	// mem.device_pointer.
	// NOTE(review): the braces of this function are missing from this excerpt.
	void mem_zero(device_memory& mem)
		// Clear the memory addressed by data_pointer (presumably the host-side copy — confirm).
		memset((void*)mem.data_pointer, 0, mem.memory_size());

		// Byte-wise clear through the CUDA driver API; the call is wrapped in cuda_assert,
		// which is defined outside this excerpt (no trailing semicolon is written here).
		cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
Ejemplo n.º 2
// Fill every frame of the image with the byte value nClearColor.
// One frame is processed when bVsync_ is false, three when it is true.
// NOTE(review): the return type and the braces are missing from this excerpt.
ImageGL::clear(unsigned char nClearColor)
    // Can only be cleared if surface is a CUDA resource
    // NOTE(review): no check enforcing this precondition is visible here.

    int nFrames = bVsync_ ? 3 : 1;        
    size_t       imagePitch;
    CUdeviceptr  pImageData;

    for (int field_num=0; field_num < nFrames; field_num++)
        // Obtain the device pointer (and pitch) for this frame.
        map(&pImageData, &imagePitch, field_num);
        // Fill the surface with nClearColor, one byte at a time.
        // NOTE(review): the byte count is nTexWidth_*nTexHeight_*Bpp() and does not use
        // imagePitch — confirm the rows are tightly packed. No matching unmap call is
        // visible in this excerpt.
        checkCudaErrors(cuMemsetD8(pImageData, nClearColor, nTexWidth_*nTexHeight_* Bpp()));
Ejemplo n.º 3
/*
 * Fill `len` bytes starting at `ptr` with the byte value `b`.
 *
 * `ptr` is handed to the CUDA driver memset calls through PTR_TO_CUDEVPTR, so it
 * is treated as a device address. Three strategies are used:
 *   - b == 0 and len is a multiple of 16: the k_swan_fast_fill kernel writes
 *     uint4 elements, unless the decomposed grid is too large (grid.x >= 65535),
 *     in which case cuMemsetD32 is used instead;
 *   - len is a multiple of 4: cuMemsetD32 with `b` replicated into all four bytes;
 *   - otherwise: cuMemsetD8 (a warning is printed).
 *
 * A zero `len` returns immediately. Any driver failure is reported via error().
 */
void swanMemset( void *ptr, unsigned char b, size_t len ) {
	// Initialised to success: the fast-fill branch launches a kernel and never
	// assigns a CUresult, so without this the final check would read an
	// indeterminate value.
	CUresult err = CUDA_SUCCESS;

	if( len == 0 ) { return; }

	if( (0 == len % 16) && b == 0 ) {
		block_config_t grid, block;
		// Fixed block size; the architecture-dependent choice was disabled upstream.
		int threads = 128;
		size_t n_vec = len / sizeof(uint4);

		swanDecompose( &grid, &block, n_vec, threads );

		if( grid.x < 65535 ) {
			// NOTE(review): launch failures of the fill kernel are not observed here.
			k_swan_fast_fill( grid, block, 0, (uint4*) ptr, n_vec );
		}
		else {
			// The region to be zeroed is too large for the simple-minded fill
			// kernel; fall back to a plain 32-bit driver memset.
			err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), 0, len/4 );
		}
	}
	else if( 0 == len % 4 ) {
		// Replicate the byte into every byte of a 32-bit word (0xbbbbbbbb).
		unsigned word = (unsigned) b * 0x01010101u;
		err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), word, len/4 );
	}
	else {
		printf("SWAN: Warning: swanMemset using D8\n");
		err = cuMemsetD8( PTR_TO_CUDEVPTR(ptr), b, len );
	}

	if ( err != CUDA_SUCCESS ) {
		error("swanMemset failed\n" );
	}
}
Ejemplo n.º 4
// Entry point of a CUDA driver-API micro-benchmark host program. It selects a device,
// parses up to five command line arguments, launches the "microbench" kernel loaded from
// microbench.cubin, copies the results back and prints clock samples and/or the
// event-timed duration.
// NOTE(review): this excerpt has lost lines (all braces and apparently several
// statements), so the control flow below cannot be read literally — compare against the
// complete source before relying on it.
int main(int argc, char* argv[])
	//int iTest = 2896;
	//while (iTest < 0x7fff)
	//	int iResult = iTest * iTest;
	//	float fTest = (float)iTest;
	//	int fResult = (int)(fTest * fTest);

	//	printf("i*i:%08x f*f:%08x\n", iResult, fResult);

	//	iTest += 0x0800;
	char deviceName[32];
	int devCount, ordinal, major, minor;
	CUdevice  hDevice;

	// Initialize the Driver API and find a device
	CUDA_CHECK( cuInit(0) );
	CUDA_CHECK( cuDeviceGetCount(&devCount) );
	// NOTE(review): major and minor are tested below but never assigned in the visible
	// code; the compute-capability queries are presumably among the missing lines.
	// NOTE(review): `major >= 5 && minor >= 2` also rejects e.g. 6.0, and the message
	// further down mentions 5.0 — confirm the intended minimum capability.
	for (ordinal = 0; ordinal < devCount; ordinal++)
		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
		if (major >= 5 && minor >= 2)
			printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
	// ordinal reaches devCount only when the loop ran to completion.
	if (ordinal == devCount)
		printf("No compute 5.0 device found, exiting.\n");

	// First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
	// "i" selects internal timing; any other value selects external timing.
	int internalTiming = 1;
	if (argc > 1)
		internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;

	// Second command line arg is the number of blocks
	int blocks = 1;
	if (argc > 2)
		blocks = atoi(argv[2]);
	if (blocks < 1)
		blocks = 1;

	// Third command line arg is the number of threads
	// Values outside [32, 1024] fall back to 128; the result is then rounded down to a
	// multiple of 32 by clearing the low five bits.
	int threads = 128;
	if (argc > 3)
		threads = atoi(argv[3]);
	if (threads > 1024 || threads < 32)
		threads = 128;
	threads &= -32;

	// Fourth command line arg: parsed as a lane count and/or a FLOP count below.
	// NOTE(review): the branch separating the two interpretations is not visible here.
	double fops = 1.0;
	int lanes = 1;
	if (argc > 4)
		if (internalTiming)
			// The number of lanes to print for each warp
			lanes = atoi(argv[4]);
			if (lanes > 32 || lanes < 1)
				lanes = 1;
			// The number of floating point operations in a full kernel launch
			fops = atof(argv[4]);

	// Fifth command line arg is the repeat count for benchmarking
	// Values outside [1, 1000] fall back to 1.
	int repeat = 1;
	if (argc > 5)
		repeat = atoi(argv[5]);
	if (repeat > 1000 || repeat < 1)
		repeat = 1;

	// Buffer size in bytes: one int per thread across all blocks.
	size_t size = sizeof(int) * threads * blocks;

	// Setup our input and output buffers
	// Only dataIn is zeroed here; dataOut and clocks are filled by the device-to-host
	// copies further down. The malloc results are not checked.
	int* dataIn  = (int*)malloc(size);
	int* dataOut = (int*)malloc(size);
	int* clocks  = (int*)malloc(size);
	memset(dataIn, 0, size);

	CUmodule hModule;
	CUfunction hKernel;
	CUevent hStart, hStop;
	CUdeviceptr devIn, devOut, devClocks;

	// Init our context and device memory buffers
	// NOTE(review): hContext is not declared in this excerpt — presumably a file-level
	// variable; confirm.
	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
	CUDA_CHECK( cuMemAlloc(&devIn, size) );
	CUDA_CHECK( cuMemAlloc(&devOut, size) );
	CUDA_CHECK( cuMemAlloc(&devClocks, size) );
	// Upload the zeroed input and clear both output buffers on the device.
	CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
	CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
	CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );

	// Events used to time the measured launch.
	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );

	// Load our kernel
	CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );

	// Setup the params
	// Kernel arguments are passed in this order: devOut, devClocks, devIn.
	void* params[] = { &devOut, &devClocks, &devIn };
	float ms = 0;

	// Warm up the clock (unless under nsight)
	// NOTE(review): the condition that skips the warm-up is not visible in this excerpt.
		for (int i = 0; i < repeat; i++)
			CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );

	// Launch the kernel
	// A single launch is bracketed by the two events; the elapsed time between them is
	// read into ms once the stop event has completed.
	CUDA_CHECK( cuEventRecord(hStart, NULL) );
	//CUDA_CHECK( cuProfilerStart() ); 
	CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
	//CUDA_CHECK( cuProfilerStop() ); 
	CUDA_CHECK( cuEventRecord(hStop, NULL) );
	CUDA_CHECK( cuEventSynchronize(hStop) );
	CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
	//CUDA_CHECK( cuCtxSynchronize() );

	// Get back our results from each kernel
	CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
	CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );

	// Cleanup and shutdown of cuda
	CUDA_CHECK( cuEventDestroy(hStart) );
	CUDA_CHECK( cuEventDestroy(hStop) );
	CUDA_CHECK( cuModuleUnload(hModule) );
	CUDA_CHECK( cuMemFree(devIn) );
	CUDA_CHECK( cuMemFree(devOut) );
	CUDA_CHECK( cuMemFree(devClocks) );
	CUDA_CHECK( cuCtxDestroy(hContext) );
	hContext = 0;

	// When using just one block, print out the internal timing data
	if (internalTiming)
		// NOTE(review): count starts at 0 and is never incremented in the visible code,
		// so the average printed below divides by zero — the increment is presumably
		// among the missing lines.
		int count = 0, total = 0, min = 999999, max = 0;
		int* clocks_p  = clocks;
		int* dataOut_p = dataOut;
		// Loop over and print results
		for (int blk = 0; blk < blocks; blk++)
			// Float view of this block's output; not used in the visible code.
			float *fDataOut = reinterpret_cast<float*>(dataOut_p);

			// Step through the block one warp (32 threads) at a time.
			for(int tid = 0; tid < threads; tid += 32)
				// Sometimes we want data on each thread, sometimes just one sample per warp is fine
				for (int lane = 0; lane < lanes; lane++)
					printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u

				// Statistics use the first lane of each warp only.
				total += clocks_p[tid];
				if (clocks_p[tid] < min) min = clocks_p[tid];
				if (clocks_p[tid] > max) max = clocks_p[tid];
			// Advance to the next block's slice of the result arrays.
			clocks_p  += threads;
			dataOut_p += threads;
		printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max);
		// For more than one block we're testing throughput and want external timing data
		// GFLOPS = fops / (seconds * 1e9) = fops / (ms * 1e6).
		printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
	// And free up host memory
	free(dataIn); free(dataOut); free(clocks);

	return 0;