/**
 * Launch the NV12 -> ARGB conversion kernel through the CUDA Driver API.
 *
 * Launch geometry: thread blocks are 32x16. Each thread writes two output
 * pixels, so one block covers 64 pixels horizontally and 16 rows vertically;
 * the grid is rounded up so the whole width x height image is covered.
 *
 * The kernel receives its parameters in this order:
 *   d_srcNV12, nSourcePitch, d_dstARGB, nDestPitch, width, height
 *
 * This function only enqueues the launch on streamID; it does not
 * synchronize the stream.
 *
 * @param d_srcNV12     device pointer to the source NV12 surface
 * @param nSourcePitch  pitch of the source surface (forwarded to the kernel
 *                      unchanged; presumably in bytes - confirm against the kernel)
 * @param d_dstARGB     device pointer to the destination ARGB surface
 * @param nDestPitch    pitch of the destination surface (forwarded unchanged)
 * @param width         image width in pixels
 * @param height        image height in pixels
 * @param fpFunc        handle of the conversion kernel to launch
 * @param streamID      stream the launch is issued on
 * @return the status returned by the launch call (CUDA_SUCCESS on success).
 *         A failure is also reported on stderr.
 */
CUresult  cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                  CUdeviceptr d_dstARGB, size_t nDestPitch,
                                  uint32 width,          uint32 height,
                                  CUfunction fpFunc, CUstream streamID)
{
    CUresult status;

    // Each thread will output 2 pixels at a time.  The grid size width is half
    // as large because of this (ceil-division in both dimensions).
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if CUDA_VERSION >= 4000
    // CUDA 4.0+ launch API: the kernel parameters are described by an array of
    // pointers to the argument values, in kernel-signature order.
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height
                   };

    // No dynamic shared memory (0) and no "extra" options (NULL).
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // Older Driver API launch method (CUDA 1.0 to 3.2): the parameter block is
    // packed by hand, one argument at a time, at explicit byte offsets.
    cutilDrvSafeCall(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // Every argument is aligned before it is written, the same way width and
    // height are, so the packing stays valid even if the pointer and pitch
    // types do not share one size. ALIGN_OFFSET is a no-op when the offset is
    // already suitably aligned (presumably it rounds the offset up).

    // device pointer for Source Surface (cuParamSetv copies the raw bytes, so
    // 64-bit device pointers are passed intact)
    ALIGN_OFFSET(offset, __alignof(d_srcNV12));
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_srcNV12,    sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    ALIGN_OFFSET(offset, __alignof(nSourcePitch));
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    ALIGN_OFFSET(offset, __alignof(d_dstARGB));
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_dstARGB,    sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    ALIGN_OFFSET(offset, __alignof(nDestPitch));
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nDestPitch,   sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    // total size of the packed parameter block
    cutilDrvSafeCall(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        // CUfunction is a pointer-sized handle: print it with %p rather than
        // truncating it to 32 bits through an unsigned int cast.
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", (void *)fpFunc, (int)status);
    }

    return status;
}
Esempio n. 2
0
int main( int argc, char** argv)
{
	uint num_threads;
	uint num_blocks, block_size;
	uint length;
	uint nBytes;
	int *list;
	int status, verbose, c, i, j, logBlocks;
	int read_stdin;
	struct timeval start_time, end_time;
	unsigned long total_time;
	CUdevice hDevice;
	CUcontext hContext;
	CUmodule hModule;
	CUfunction bitonicBlockFn;
	CUfunction mergeBlocksFn;
	CUdeviceptr pDeviceArrayA;
	CUdeviceptr pDeviceArrayB;

	status = SUCCESS;
	verbose = 0;
	read_stdin = FALSE;
	length = 0;

	while ((c = getopt (argc, argv, "dip:vO")) != -1) {
		switch (c) {
		case 'd':
			verbose |= GROSS_DEBUG;
			break;
		case 'i':
			read_stdin = TRUE;
		case 'O':
			verbose |= OUTPUT;
			break;
		case 'p':
			length = 1 << atoi(optarg);
			break;
		case 'v':
			verbose |= DEBUG;
			break;
		case '?':
		default:
			print_usage();
			return FAILURE;
		}
	}

	if ( read_stdin == TRUE ) {
		/* Read sequence of integers from stdin */
		list = (int*) malloc (INIT_INPUT_SIZE * sizeof(int) );
		length = readIntegers(list, INIT_INPUT_SIZE);
	} else if ( length > 0 ) {
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	} else if (optind >= argc) { /* No size was given */
		print_usage();
		return FAILURE;
	} else {
		/* Generate our own integers */
		length = atoi(argv[optind]);
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	}

	/*
	* Phase 1:
	* 	There will be one thread for each element to be sorted. Each
	*	block will perform bitonic sort on MAX_THREADS_PER_BLOCK elements.
	*/

	/* Initialize sizes */
	num_threads = _min(length, MAX_THREADS_PER_BLOCK );
	num_blocks = (length-1) / MAX_THREADS_PER_BLOCK + 1;
	nBytes = length * sizeof(int);

	if (verbose & DEBUG) printf("Initializing GPU.\n");
	
	/* Start timing */
	gettimeofday(&start_time, NULL);

	/* Initialize GPU */
	cutilDrvSafeCall( cuInit(0) 					);
	cutilDrvSafeCall( cuDeviceGet(&hDevice, 0)			); 
	cutilDrvSafeCall( cuCtxCreate(&hContext, 0, hDevice) 		);
	cutilDrvSafeCall( cuModuleLoad(&hModule, MODULE_FILE) 		);
	cutilDrvSafeCall( cuModuleGetFunction(&bitonicBlockFn, hModule, BITONIC_BLOCK_FN) );

	/* Allocate memory on the device */
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayA, nBytes)		);
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayB, nBytes)		);
	cutilDrvSafeCall( cuMemcpyHtoD(pDeviceArrayA, list, nBytes) 	);
	cutilDrvSafeCall( cuFuncSetBlockShape(bitonicBlockFn, num_threads, 1, 1));
	cutilDrvSafeCall( cuParamSeti(bitonicBlockFn, 0, pDeviceArrayA)	);
	cutilDrvSafeCall( cuParamSetSize(bitonicBlockFn, 4)		);
	
	/* Execute the kernel on the GPU */
	if ( verbose & DEBUG ) printf("Launching bitonic sort kernel with %d blocks and %d threads per block.\n", num_blocks, num_threads);
	cutilDrvSafeCall( cuLaunchGrid(bitonicBlockFn, num_blocks, 1)		);

	/*
	* Phase 2:
	* 	At this point each block is a sorted list. Now it's time to merge them.	
	*/

	/* TODO This should go away after development */
	if ( verbose & GROSS_DEBUG ) {
		cuMemcpyDtoH(list, pDeviceArrayA, nBytes);
		for (i=0; i<num_blocks; ++i) {
			printf("### Block %d:\n", i);
			for (j=0; j<num_threads; ++j) {
				printf("%d\n", list[i*num_threads + j]);
			}
		}
	}
	
	i=0;

	/* Do we need to merge blocks? */
	if ( num_blocks > 1 ) {

		/* There will be Log_2(num_blocks) merge steps. */
		logBlocks = 0;
		for (i=1; i<num_blocks; i *= 2)	++logBlocks;

		if ( verbose & DEBUG ) printf("There will be %d merge steps.\n", logBlocks);	

		block_size = num_threads; 	/* How big the blocks were in the last grid launch. */
		num_threads = num_blocks >> 1;  /* Start with blocks/2 threads */
		num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK  +  1;

		cutilDrvSafeCall( cuModuleGetFunction(&mergeBlocksFn, hModule, MERGE_BLOCKS_FN) );
		cuParamSeti(mergeBlocksFn, 4, block_size);
		cuParamSetSize(mergeBlocksFn, 16);

		for (i=0; i < logBlocks; ++i) {
			cuFuncSetBlockShape(mergeBlocksFn, num_threads, 1, 1);
			cuParamSeti(mergeBlocksFn, 0, i); /* set merge level */

			/* Merging uses a source array and destination array, the gpu has 2 arrays allocated
			 * so we swap which is the source and which is the destination for each iteration. */
			if ( i%2 == 0 ) {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayA);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayB);
			} else {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayB);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayA);
			}

			if ( verbose & DEBUG ) {
				printf("Launching block merge kernel with %d blocks and %d threads per block\n", 
									num_blocks, num_threads/num_blocks);
			}	
			cutilDrvSafeCall( cuLaunchGrid(mergeBlocksFn, num_blocks, 1) );

			num_threads = num_threads >> 1;
			num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK  +  1;
		}
	}