// Launches `kernel` and returns the elapsed GPU time in seconds.
// Uses CUDA events when available; otherwise falls back to a CPU timer.
// NOTE(review): the previous version had `assert(0);// #if 0` which commented
// out the `#if 0` while leaving its `#endif`, producing a dangling directive
// and an always-asserting dead path. The fallback is restored below.
F32 CudaModule::launchKernelTimed(CUfunction kernel, const Vec2i& blockSize, const Vec2i& gridSize, bool async, CUstream stream, bool yield)
{
    // Update globals before timing so setup cost is excluded from the measurement.
    updateGlobals();
    updateTexRefs(kernel);
    sync(false);

    // Events not supported => use a CPU-based timer.
    if (!s_startEvent)
    {
        Timer timer(true);
        launchKernel(kernel, blockSize, gridSize, async, stream);
        sync(false); // spin for more accurate timing
        return timer.getElapsed();
    }

    // Use events: record around the launch, then sync before reading the delta.
    checkError("cuEventRecord", cuEventRecord(s_startEvent, NULL));
    launchKernel(kernel, blockSize, gridSize, async, stream);
    checkError("cuEventRecord", cuEventRecord(s_endEvent, NULL));
    sync(yield);

    F32 time;
    checkError("cuEventElapsedTime", cuEventElapsedTime(&time, s_startEvent, s_endEvent));
    return time * 1.0e-3f; // cuEventElapsedTime reports milliseconds; convert to seconds
}
// Run the CUDA part of the computation: map the GL pixel buffer, hand its
// device pointer to the kernel, then unmap so GL can use it again.
// NOTE(review): the previous version ignored all cudaGraphics* return codes,
// so a failed map/get-pointer still launched the kernel with a NULL pointer.
void runCuda(glm::mat3 rot, glm::vec3 campos, float focalLength, LOD l)
{
    uchar4 *dptr = NULL;
    size_t num_bytes;

    // Map the buffer object; if mapping fails there is nothing to unmap or draw.
    if (cudaGraphicsMapResources(1, &cuda_pbo, 0) != cudaSuccess)
        return;

    // Get the device address for the kernel; only launch on success.
    if (cudaGraphicsResourceGetMappedPointer((void**)&dptr, &num_bytes, cuda_pbo) == cudaSuccess
        && dptr != NULL)
    {
        launchKernel(dptr, window_width, window_height, focalLength, rot, campos, l);
    }

    // Unmap the buffer object (always, since the map above succeeded).
    cudaGraphicsUnmapResources(1, &cuda_pbo, 0);
}
int main(int argc, const char * argv[])
{
    // Allocate the output bitmap and seed its pixel data.
    CPUBitmap bitmap(GLOBAL_DIM, GLOBAL_DIM);
    unsigned char *pixels = bitmap.get_ptr();
    initBitmap(pixels);

    // Load the kernel source and run it over the bitmap.
    const char *kernelSource = importKernel();
    launchKernel(kernelSource, pixels);

    // Show the result.
    bitmap.display_and_exit();
    return 0;
}
// Process a pair of equally-sized input tiles on the GPU and copy the result
// into *outTile. Tile one goes down via the parent's copyTileToDevice(); tile
// two is staged into a CUDA array whose channel descriptor is derived from the
// input pixel type. Throws std::runtime_error on size mismatch or CUDA failure.
ErrorCode GpuBinaryImageAlgorithm<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::operator()(const cvt::cvTile<InputPixelType>& tile, const cvt::cvTile<InputPixelType> &tile2, const cvt::cvTile<OutputPixelType> ** outTile)
{
    // TO-DO: error-check template params for type/bounds.
    const cv::Size2i tileSize = tile.getSize();

    if (tileSize != this->dataSize) {
        std::stringstream ss;
        ss << tileSize << " expected of " << this->dataSize << std::endl;
        throw std::runtime_error(ss.str());
    }

    // BUG FIX: the message previously said the tiles "must have different sizes",
    // the opposite of the condition being enforced.
    if (tileSize != tile2.getSize()) {
        throw std::runtime_error("Both of the incoming tiles must have the same size");
    }

    /*
     * Copy data down for tile one using the parent's implementation.
     */
    this->lastError = this->copyTileToDevice(tile);
    if (this->lastError != cvt::Ok) {
        throw std::runtime_error("Failed to copy tile to device");
    }

    // Map the (implementation-defined) mangled typeid name to a CUDA channel kind.
    // NOTE(review): these single-letter names match the Itanium ABI mangling —
    // confirm if this code must also build with MSVC, whose names differ.
    std::string inTypeIdentifier(typeid(this->tempForTypeTesting).name());
    size_t bitDepth = 0;

    if (inTypeIdentifier == "a" || inTypeIdentifier == "s" ||
        inTypeIdentifier == "i" || inTypeIdentifier == "l") {
        this->channelType = cudaChannelFormatKindSigned;
    }
    else if (inTypeIdentifier == "h" || inTypeIdentifier == "t" ||
             inTypeIdentifier == "j" || inTypeIdentifier == "m") {
        this->channelType = cudaChannelFormatKindUnsigned;
    }
    else if (inTypeIdentifier == "f" || inTypeIdentifier == "d") {
        this->channelType = cudaChannelFormatKindFloat;
    }
    else {
        this->lastError = InitFailUnsupportedInputType;
        return this->lastError;
    }
    bitDepth = sizeof(this->tempForTypeTesting) * 8;

    cudaError cuer;
    cudaChannelFormatDesc inputDescriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, this->channelType);
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        this->lastError = CudaError;
        std::cout << "CUDA ERR = " << cuer << std::endl;
        throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO CREATE CHANNEL");
    }

    // BUG FIX: allocate with the type-aware descriptor built above. The previous
    // version built a second descriptor hard-coded to cudaChannelFormatKindUnsigned,
    // silently discarding the detected signed/float channel kind.
    cudaMallocArray((cudaArray**)&gpuInputDataArrayTwo_,
                    &inputDescriptor, this->dataSize.width, this->dataSize.height);
    // BUG FIX: check the allocation immediately, not after the memcpy below,
    // so the error message identifies the call that actually failed.
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERR = " << cuer << std::endl;
        throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO ALLOCATE MEMORY");
    }

    const unsigned int offsetX = 0;
    const unsigned int offsetY = 0;
    const unsigned char* tile_data_ptr = &(tile2[0].data[0]);
    const unsigned int tileArea = tile.getSize().area();

    cudaMemcpyToArrayAsync(gpuInputDataArrayTwo_,       // the device | destination address
                           offsetX, offsetY,            // the destination offsets
                           tile_data_ptr,               // the host | source address
                           sizeof(InputPixelType) * tileArea, // the size of the copy in bytes
                           cudaMemcpyHostToDevice,      // the type of the copy
                           this->stream);               // the device command stream
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERR = " << cuer << std::endl;
        throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO COPY MEMORY");
    }

    // Invoke kernel with empirically chosen block size.
    unsigned short bW = 16;
    unsigned short bH = 16;
    launchKernel(bW, bH);

    this->lastError = this->copyTileFromDevice(outTile);
    if (this->lastError != cvt::Ok) {
        // BUG FIX: the exception was previously constructed but never thrown,
        // so the error was silently dropped and Ok was returned.
        throw std::runtime_error("Failed copy off tile from device");
    }
    return Ok;
}