/**
 * Launch the NV12 -> ARGB conversion kernel through the CUDA Driver API.
 *
 * Launch geometry: thread blocks are 32x16x1. Each thread outputs two pixels,
 * so the grid only has to span half of the image width:
 *   grid.x = ceil(width  / (2 * 32))
 *   grid.y = ceil(height / 16)
 *
 * Kernel parameters are handed over in this order:
 *   d_srcNV12, nSourcePitch, d_dstARGB, nDestPitch, width, height
 *
 * @param d_srcNV12    device pointer for the source surface
 * @param nSourcePitch source pitch (presumably in bytes -- confirm against the kernel)
 * @param d_dstARGB    device pointer for the destination surface
 * @param nDestPitch   destination pitch (presumably in bytes -- confirm against the kernel)
 * @param width        width of the image
 * @param height       height of the image
 * @param fpFunc       handle of the kernel function to launch
 * @param streamID     stream the launch is issued on
 * @return the CUresult of the launch call; a diagnostic is written to stderr
 *         when it is not CUDA_SUCCESS.
 */
CUresult cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                 CUdeviceptr d_dstARGB, size_t nDestPitch,
                                 uint32 width, uint32 height,
                                 CUfunction fpFunc, CUstream streamID)
{
    CUresult status;

    // Each thread will output 2 pixels at a time. The grid size width is half
    // as large because of this.
    dim3 block(32, 16, 1);
    dim3 grid((width  + (2 * block.x - 1)) / (2 * block.x),
              (height + (block.y - 1)) / block.y,
              1);

#if CUDA_VERSION >= 4000
    // CUDA 4.0+ API for kernel parameter passing and launching (simpler method):
    // one pointer per kernel argument, in the order the kernel declares them.
    void *args[] = { &d_srcNV12, &nSourcePitch, &d_dstARGB, &nDestPitch, &width, &height };

    // CUDA 4.0+ Driver API kernel launch call (no dynamic shared memory, no extra options)
    status = cuLaunchKernel(fpFunc,
                            grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID, args, NULL);
#else
    // Older Driver API launch method (CUDA V1.0 to V3.2): the block shape and
    // each parameter are set explicitly before the grid is launched.
    cutilDrvSafeCall(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));

    int offset = 0;

    // cuParamSetv() is used for the device pointers because it also allows
    // 64-bit device pointers to be passed.

    // device pointer for Source Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_srcNV12, sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_dstARGB, sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nDestPitch, sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    // total size of the parameter block
    cutilDrvSafeCall(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        // CUfunction is a pointer-sized handle: print it with %p instead of
        // squeezing it through an unsigned int, which truncates on 64-bit builds.
        fprintf(stderr,
                "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n",
                (void *)fpFunc, (int)status);
    }

    return status;
}
int main( int argc, char** argv) { uint num_threads; uint num_blocks, block_size; uint length; uint nBytes; int *list; int status, verbose, c, i, j, logBlocks; int read_stdin; struct timeval start_time, end_time; unsigned long total_time; CUdevice hDevice; CUcontext hContext; CUmodule hModule; CUfunction bitonicBlockFn; CUfunction mergeBlocksFn; CUdeviceptr pDeviceArrayA; CUdeviceptr pDeviceArrayB; status = SUCCESS; verbose = 0; read_stdin = FALSE; length = 0; while ((c = getopt (argc, argv, "dip:vO")) != -1) { switch (c) { case 'd': verbose |= GROSS_DEBUG; break; case 'i': read_stdin = TRUE; case 'O': verbose |= OUTPUT; break; case 'p': length = 1 << atoi(optarg); break; case 'v': verbose |= DEBUG; break; case '?': default: print_usage(); return FAILURE; } } if ( read_stdin == TRUE ) { /* Read sequence of integers from stdin */ list = (int*) malloc (INIT_INPUT_SIZE * sizeof(int) ); length = readIntegers(list, INIT_INPUT_SIZE); } else if ( length > 0 ) { list = (int*) malloc (length * sizeof(int) ); randomInts(list, length); } else if (optind >= argc) { /* No size was given */ print_usage(); return FAILURE; } else { /* Generate our own integers */ length = atoi(argv[optind]); list = (int*) malloc (length * sizeof(int) ); randomInts(list, length); } /* * Phase 1: * There will be one thread for each element to be sorted. Each * block will perform bitonic sort on MAX_THREADS_PER_BLOCK elements. 
*/ /* Initialize sizes */ num_threads = _min(length, MAX_THREADS_PER_BLOCK ); num_blocks = (length-1) / MAX_THREADS_PER_BLOCK + 1; nBytes = length * sizeof(int); if (verbose & DEBUG) printf("Initializing GPU.\n"); /* Start timing */ gettimeofday(&start_time, NULL); /* Initialize GPU */ cutilDrvSafeCall( cuInit(0) ); cutilDrvSafeCall( cuDeviceGet(&hDevice, 0) ); cutilDrvSafeCall( cuCtxCreate(&hContext, 0, hDevice) ); cutilDrvSafeCall( cuModuleLoad(&hModule, MODULE_FILE) ); cutilDrvSafeCall( cuModuleGetFunction(&bitonicBlockFn, hModule, BITONIC_BLOCK_FN) ); /* Allocate memory on the device */ cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayA, nBytes) ); cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayB, nBytes) ); cutilDrvSafeCall( cuMemcpyHtoD(pDeviceArrayA, list, nBytes) ); cutilDrvSafeCall( cuFuncSetBlockShape(bitonicBlockFn, num_threads, 1, 1)); cutilDrvSafeCall( cuParamSeti(bitonicBlockFn, 0, pDeviceArrayA) ); cutilDrvSafeCall( cuParamSetSize(bitonicBlockFn, 4) ); /* Execute the kernel on the GPU */ if ( verbose & DEBUG ) printf("Launching bitonic sort kernel with %d blocks and %d threads per block.\n", num_blocks, num_threads); cutilDrvSafeCall( cuLaunchGrid(bitonicBlockFn, num_blocks, 1) ); /* * Phase 2: * At this point each block is a sorted list. Now it's time to merge them. */ /* TODO This should go away after development */ if ( verbose & GROSS_DEBUG ) { cuMemcpyDtoH(list, pDeviceArrayA, nBytes); for (i=0; i<num_blocks; ++i) { printf("### Block %d:\n", i); for (j=0; j<num_threads; ++j) { printf("%d\n", list[i*num_threads + j]); } } } i=0; /* Do we need to merge blocks? */ if ( num_blocks > 1 ) { /* There will be Log_2(num_blocks) merge steps. */ logBlocks = 0; for (i=1; i<num_blocks; i *= 2) ++logBlocks; if ( verbose & DEBUG ) printf("There will be %d merge steps.\n", logBlocks); block_size = num_threads; /* How big the blocks were in the last grid launch. 
*/ num_threads = num_blocks >> 1; /* Start with blocks/2 threads */ num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1; cutilDrvSafeCall( cuModuleGetFunction(&mergeBlocksFn, hModule, MERGE_BLOCKS_FN) ); cuParamSeti(mergeBlocksFn, 4, block_size); cuParamSetSize(mergeBlocksFn, 16); for (i=0; i < logBlocks; ++i) { cuFuncSetBlockShape(mergeBlocksFn, num_threads, 1, 1); cuParamSeti(mergeBlocksFn, 0, i); /* set merge level */ /* Merging uses a source array and destination array, the gpu has 2 arrays allocated * so we swap which is the source and which is the destination for each iteration. */ if ( i%2 == 0 ) { cuParamSeti(mergeBlocksFn, 8, pDeviceArrayA); cuParamSeti(mergeBlocksFn, 12, pDeviceArrayB); } else { cuParamSeti(mergeBlocksFn, 8, pDeviceArrayB); cuParamSeti(mergeBlocksFn, 12, pDeviceArrayA); } if ( verbose & DEBUG ) { printf("Launching block merge kernel with %d blocks and %d threads per block\n", num_blocks, num_threads/num_blocks); } cutilDrvSafeCall( cuLaunchGrid(mergeBlocksFn, num_blocks, 1) ); num_threads = num_threads >> 1; num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1; } }