Code Example #1
F32 CudaModule::launchKernelTimed(CUfunction kernel, const Vec2i& blockSize, 
                                  const Vec2i& gridSize, bool async, 
                                  CUstream stream, bool yield)
{
  // Update globals before timing.
  updateGlobals();
  updateTexRefs(kernel);
  sync(false);

  // Events not supported => fall back to a CPU-based timer (path currently disabled).
  if (!s_startEvent)
  {
    assert(0); // The CPU-timer fallback below is compiled out; events are required here.
#if 0
    Timer timer(true);
    launchKernel(kernel, blockSize, gridSize, async, stream);
    sync(false); // spin for more accurate timing
    return timer.getElapsed();
#endif
  }

  // Use events.
  checkError("cuEventRecord", cuEventRecord(s_startEvent, NULL));
  launchKernel(kernel, blockSize, gridSize, async, stream);
  checkError("cuEventRecord", cuEventRecord(s_endEvent, NULL));
  sync(yield);

  F32 time;
  checkError("cuEventElapsedTime", cuEventElapsedTime(&time, s_startEvent, s_endEvent));
  return time * 1.0e-3f;
}
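This timing path relies on the static events s_startEvent and s_endEvent being created ahead of time, which the snippet does not show. A minimal sketch of that one-time setup with the CUDA Driver API, reusing the checkError helper from the example (initTimingEvents and shutdownTimingEvents are hypothetical names, not part of the original code):

// Hypothetical one-time setup/teardown for the timing events used above.
static CUevent s_startEvent = NULL;
static CUevent s_endEvent   = NULL;

static void initTimingEvents(void)
{
  if (!s_startEvent)
    checkError("cuEventCreate", cuEventCreate(&s_startEvent, CU_EVENT_DEFAULT));
  if (!s_endEvent)
    checkError("cuEventCreate", cuEventCreate(&s_endEvent, CU_EVENT_DEFAULT));
}

static void shutdownTimingEvents(void)
{
  if (s_startEvent) { checkError("cuEventDestroy", cuEventDestroy(s_startEvent)); s_startEvent = NULL; }
  if (s_endEvent)   { checkError("cuEventDestroy", cuEventDestroy(s_endEvent));   s_endEvent   = NULL; }
}

If the events were never created, s_startEvent stays NULL and launchKernelTimed hits the disabled CPU-timer branch above.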
Code Example #2
File: cudapbo.cpp  Project: jojona/Mandelbulb
// Run the Cuda part of the computation
void runCuda(glm::mat3 rot, glm::vec3 campos, float focalLength, LOD l) {
	uchar4 *dptr = NULL;

	// Map the buffer object
	size_t num_bytes;
	cudaGraphicsMapResources(1, &cuda_pbo, 0);

	// Get Address for kernel 
	cudaGraphicsResourceGetMappedPointer((void**)&dptr, &num_bytes, cuda_pbo);

	launchKernel(dptr, window_width, window_height, focalLength, rot, campos, l);

	// Unmap the buffer object
	cudaGraphicsUnmapResources(1, &cuda_pbo, 0);

}
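runCuda() assumes that cuda_pbo has already been registered against an OpenGL pixel buffer object elsewhere in the project. A minimal sketch of that registration, assuming a GL PBO drives the display and that an OpenGL loader such as GLEW is already included (createPBO, pbo, and the chosen size/flags are assumptions, not code from this project):

#include <cuda_runtime.h>
#include <cuda_gl_interop.h>   // assumes an OpenGL loader (e.g. GLEW) was included first

GLuint pbo = 0;
cudaGraphicsResource *cuda_pbo = NULL;

// Hypothetical setup: allocate a PBO sized for the window and register it with
// CUDA so runCuda() can map it and write one uchar4 per pixel each frame.
void createPBO(unsigned int width, unsigned int height)
{
	glGenBuffers(1, &pbo);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
	glBufferData(GL_PIXEL_UNPACK_BUFFER, width * height * sizeof(uchar4), NULL, GL_DYNAMIC_DRAW);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

	// Write-discard: CUDA overwrites the whole buffer every frame.
	cudaGraphicsGLRegisterBuffer(&cuda_pbo, pbo, cudaGraphicsRegisterFlagsWriteDiscard);
}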
Code Example #3
File: main.cpp  Project: kimsavinfo/OpenCL_BlurEffect
int main(int argc, const char * argv[])
{
    // Create a bitmap
    CPUBitmap bitmap(GLOBAL_DIM, GLOBAL_DIM);
    unsigned char *ptr = bitmap.get_ptr();
    initBitmap(ptr);
    
    // Import and launch kernels
    const char *custom_kernel = importKernel();
    launchKernel(custom_kernel, ptr);
    
    // Show the result
    bitmap.display_and_exit();
    
    return 0;
}
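Neither importKernel() nor launchKernel() is shown in this listing. A plausible sketch of importKernel(), assuming the OpenCL source sits in a blur.cl file next to the executable (the file name and the ownership convention are assumptions):

#include <cstring>
#include <fstream>
#include <sstream>
#include <string>

// Hypothetical implementation: read the OpenCL kernel source from disk and
// return a heap-allocated C string for launchKernel() to build and enqueue.
const char *importKernel()
{
    std::ifstream file("blur.cl");           // file name is an assumption
    std::stringstream source;
    source << file.rdbuf();

    std::string src = source.str();
    char *buffer = new char[src.size() + 1];
    std::memcpy(buffer, src.c_str(), src.size() + 1);
    return buffer;                           // caller owns the allocation
}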
Code Example #4
ErrorCode GpuBinaryImageAlgorithm<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::operator()(const cvt::cvTile<InputPixelType>& tile,
					const cvt::cvTile<InputPixelType> &tile2,const cvt::cvTile<OutputPixelType> ** outTile)
{
	// TODO: error-check template parameters for type/bounds

	const cv::Size2i tileSize = tile.getSize();
	
	if (tileSize != this->dataSize)
	{
		std::stringstream ss;
		ss << "Tile size " << tileSize << " does not match the expected size " << this->dataSize;
		throw std::runtime_error(ss.str());
	}

	if (tileSize != tile2.getSize()) {
		throw std::runtime_error("Both incoming tiles must have the same size");
	}

	/*
	 *  Copy data down for tile using the parents implementation
	 */
	this->lastError = this->copyTileToDevice(tile);
	if (this->lastError != cvt::Ok)
	{
		throw std::runtime_error("Failed to copy tile to device");
	}
	std::string inTypeIdentifier(typeid(this->tempForTypeTesting).name());
	size_t bitDepth = 0;
	cudaChannelFormatDesc inputDescriptor;

	if(inTypeIdentifier == "a" || 
	   inTypeIdentifier == "s" || 
	   inTypeIdentifier == "i" ||
	   inTypeIdentifier == "l")
	{
		this->channelType = cudaChannelFormatKindSigned;
	}
	else if(inTypeIdentifier == "h" || 
			inTypeIdentifier == "t" || 
			inTypeIdentifier == "j" || 
			inTypeIdentifier == "m")
	{
		this->channelType = cudaChannelFormatKindUnsigned;
	}
	else if(inTypeIdentifier == "f" || 
			inTypeIdentifier == "d") 
	{
		this->channelType = cudaChannelFormatKindFloat;
	}
	else
	{
		this->lastError = InitFailUnsupportedInputType;
		return this->lastError;
	}

	bitDepth = sizeof(this->tempForTypeTesting) * 8;

	cudaError cuer;
	inputDescriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, this->channelType);
	cuer = cudaGetLastError();
	
	if (cuer != cudaSuccess) {
		this->lastError = CudaError;
		std::cout << "CUDA ERR = " << cuer << std::endl;
		throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO CREATE CHANNEL");
	}

	// Note: this descriptor hard-codes an unsigned channel kind rather than reusing this->channelType computed above.
	cudaChannelFormatDesc input_descriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, cudaChannelFormatKindUnsigned);
	cudaMallocArray((cudaArray**)&gpuInputDataArrayTwo_, &input_descriptor, this->dataSize.width, this->dataSize.height);
	const unsigned int offsetX = 0;
	const unsigned int offsetY = 0;
	const unsigned char* tile_data_ptr = &(tile2[0].data[0]);
	const unsigned int tileArea = tile.getSize().area();

	cudaMemcpyToArrayAsync(gpuInputDataArrayTwo_,	// the device | destination address
			offsetX , offsetY,  			// the destination offsets
			tile_data_ptr,					// the host | source address
			sizeof(InputPixelType) * tileArea,		// the size of the copy in bytes
			cudaMemcpyHostToDevice,			// the type of the copy
			this->stream);						// the device command stream

	cuer = cudaGetLastError();
	if (cuer != cudaSuccess) {
		std::cout << "CUDA ERR = " << cuer << std::endl;
		throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO ALLOCATE MEMORY");
	}

	// Invoke kernel with empirically chosen block size
	unsigned short bW = 16;
	unsigned short bH = 16;

	launchKernel(bW, bH);

	this->lastError = this->copyTileFromDevice(outTile);
	if(this->lastError != cvt::Ok) {
		throw std::runtime_error("Failed to copy tile from device");
	}
	return Ok;
}
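launchKernel(bW, bH) itself is not part of this listing. The grid-size arithmetic such a wrapper typically performs is sketched below as a standalone helper; makeGrid and the commented kernel invocation are assumptions for illustration, not code from the project:

#include <cuda_runtime.h>

// Hypothetical helper showing the usual launch-configuration arithmetic:
// cover a width x height tile with bW x bH thread blocks, rounding up so the
// partial blocks at the right and bottom edges are still launched.
static dim3 makeGrid(unsigned int width, unsigned int height,
                     unsigned short bW, unsigned short bH)
{
	return dim3((width  + bW - 1) / bW,
	            (height + bH - 1) / bH);
}

// Possible use inside launchKernel(bW, bH); the kernel name and argument list
// are placeholders, not taken from the source:
//   dim3 block(bW, bH);
//   dim3 grid = makeGrid(dataSize.width, dataSize.height, bW, bH);
//   binaryImageKernel<<<grid, block, 0, stream>>>(/* device buffers, dims */);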