/* Zero a buffer on both the host and the device.
 *
 * The host copy at mem.data_pointer is cleared with memset and the device
 * allocation at mem.device_pointer is cleared with cuMemsetD8; both use
 * mem.memory_size() as the byte count. The device call is bracketed by
 * cuda_push_context() / cuda_pop_context(). */
void mem_zero(device_memory& mem)
{
	const size_t num_bytes = mem.memory_size();

	/* Host side first. */
	memset((void*)mem.data_pointer, 0, num_bytes);

	/* Then the device side. */
	cuda_push_context();
	/* cuda_assert is invoked without a trailing ';', exactly as the original
	 * call site did -- presumably the macro expands to a complete statement. */
	cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, num_bytes))
	cuda_pop_context();
}
void ImageGL::clear(unsigned char nClearColor) { // Can only be cleared if surface is a CUDA resource assert(bIsCudaResource_); int nFrames = bVsync_ ? 3 : 1; size_t imagePitch; CUdeviceptr pImageData; for (int field_num=0; field_num < nFrames; field_num++) { map(&pImageData, &imagePitch, field_num); // clear the surface to solid white checkCudaErrors(cuMemsetD8(pImageData, nClearColor, nTexWidth_*nTexHeight_* Bpp())); unmap(field_num); } }
void swanMemset( void *ptr, unsigned char b, size_t len ) { CUresult err; if( len == 0 ) { return; } if( (0 == len % 16) && b==0 ) { block_config_t grid, block; int threads; // if( __CUDA_ARCH__ == 110 ) { threads = 128; // } else { // threads = CUDA_THREAD_MAX; // } swanDecompose( &grid, &block, len/(sizeof(uint4)), threads ); if( grid.x < 65535 ) { k_swan_fast_fill( grid, block, 0, (uint4*) ptr, len/(sizeof(uint4) ) ); } else { // the region to be zeroed is too large for the simple-minded fill kernel // fall-back to something dumb err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), 0, len/4 ); } return; } else if( 0 == len % 4 ) { // printf("SWAN: Warning: swanMemset using D32\n"); unsigned word = 0; word = b; word <<=8; word |= b; word = word | (word<<16); err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), word, len/4 ); } else { printf("SWAN: Warning: swanMemset using D8\n"); err = cuMemsetD8( PTR_TO_CUDEVPTR(ptr), b, len ); } if ( err != CUDA_SUCCESS ) { error("swanMemset failed\n" ); } }
int main(int argc, char* argv[]) { //int iTest = 2896; //while (iTest < 0x7fff) //{ // int iResult = iTest * iTest; // float fTest = (float)iTest; // int fResult = (int)(fTest * fTest); // printf("i*i:%08x f*f:%08x\n", iResult, fResult); // iTest += 0x0800; //} //exit(0); char deviceName[32]; int devCount, ordinal, major, minor; CUdevice hDevice; // Initialize the Driver API and find a device CUDA_CHECK( cuInit(0) ); CUDA_CHECK( cuDeviceGetCount(&devCount) ); for (ordinal = 0; ordinal < devCount; ordinal++) { CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); if (major >= 5 && minor >= 2) { printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); break; } } if (ordinal == devCount) { printf("No compute 5.0 device found, exiting.\n"); exit(EXIT_FAILURE); } // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing int internalTiming = 1; if (argc > 1) internalTiming = strcmp(argv[1], "i") == 0 ? 
1 : 0; // Second command line arg is the number of blocks int blocks = 1; if (argc > 2) blocks = atoi(argv[2]); if (blocks < 1) blocks = 1; // Third command line arg is the number of threads int threads = 128; if (argc > 3) threads = atoi(argv[3]); if (threads > 1024 || threads < 32) threads = 128; threads &= -32; // Forth command line arg: double fops = 1.0; int lanes = 1; if (argc > 4) { if (internalTiming) { // The number of lanes to print for each warp lanes = atoi(argv[4]); if (lanes > 32 || lanes < 1) lanes = 1; } else // The number of floating point operations in a full kernel launch fops = atof(argv[4]); } // Fifth command line arg is the repeat count for benchmarking int repeat = 1; if (argc > 5) repeat = atoi(argv[5]); if (repeat > 1000 || repeat < 1) repeat = 1; // threads = total number of threads size_t size = sizeof(int) * threads * blocks; // Setup our input and output buffers int* dataIn = (int*)malloc(size); int* dataOut = (int*)malloc(size); int* clocks = (int*)malloc(size); memset(dataIn, 0, size); CUmodule hModule; CUfunction hKernel; CUevent hStart, hStop; CUdeviceptr devIn, devOut, devClocks; // Init our context and device memory buffers CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); CUDA_CHECK( cuMemAlloc(&devIn, size) ); CUDA_CHECK( cuMemAlloc(&devOut, size) ); CUDA_CHECK( cuMemAlloc(&devClocks, size) ); CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); // Load our kernel CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); // Setup the params void* params[] = { &devOut, &devClocks, &devIn }; float ms = 0; // Warm up the clock (unless under nsight) if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER for (int i = 0; i < repeat; i++) 
CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); // Launch the kernel CUDA_CHECK( cuEventRecord(hStart, NULL) ); //CUDA_CHECK( cuProfilerStart() ); CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); //CUDA_CHECK( cuProfilerStop() ); CUDA_CHECK( cuEventRecord(hStop, NULL) ); CUDA_CHECK( cuEventSynchronize(hStop) ); CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); //CUDA_CHECK( cuCtxSynchronize() ); // Get back our results from each kernel CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); // Cleanup and shutdown of cuda CUDA_CHECK( cuEventDestroy(hStart) ); CUDA_CHECK( cuEventDestroy(hStop) ); CUDA_CHECK( cuModuleUnload(hModule) ); CUDA_CHECK( cuMemFree(devIn) ); CUDA_CHECK( cuMemFree(devOut) ); CUDA_CHECK( cuMemFree(devClocks) ); CUDA_CHECK( cuCtxDestroy(hContext) ); hContext = 0; // When using just one block, print out the internal timing data if (internalTiming) { int count = 0, total = 0, min = 999999, max = 0; int* clocks_p = clocks; int* dataOut_p = dataOut; // Loop over and print results for (int blk = 0; blk < blocks; blk++) { float *fDataOut = reinterpret_cast<float*>(dataOut_p); for(int tid = 0; tid < threads; tid += 32) { // Sometimes we want data on each thread, sometimes just one sample per warp is fine for (int lane = 0; lane < lanes; lane++) printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u count++; total += clocks_p[tid]; if (clocks_p[tid] < min) min = clocks_p[tid]; if (clocks_p[tid] > max) max = clocks_p[tid]; } clocks_p += threads; dataOut_p += threads; } printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); } else { // For more than one block we're testing throughput and want external timing data printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); } // And free up host memory free(dataIn); 
free(dataOut); free(clocks); return 0; }