TransferFunction::TransferFunction(vtkSmartPointer<vtkPiecewiseFunction> otf,
                                   vtkSmartPointer<vtkColorTransferFunction> ctf,
                                   QObject *parent)
    : QObject(parent)
{
    opacityTF = otf;
    colorTF = ctf;

    this->otf = QSharedPointer<ctkTransferFunction>(new ctkVTKPiecewiseFunction(opacityTF));
    this->ctf = QSharedPointer<ctkTransferFunction>(new ctkVTKColorTransferFunction(colorTF));

    connect(this->otf.data(), SIGNAL(changed()), this, SLOT(onOpacityTFChanged()));
    connect(this->ctf.data(), SIGNAL(changed()), this, SLOT(onColorTFChanged()));

    compositeTex = 0;

    // initialize each table
    opacityTF->GetTable(0.0, 1.0, TABLE_SIZE, opacityTable);
    colorTF->GetTable(0.0, 1.0, TABLE_SIZE, colorTable);
    CompositeTable();

    channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
    CudaSafeCall(cudaMallocArray(&array, &channelDesc, TABLE_SIZE));
    CudaSafeCall(cudaMemcpyToArray(array, 0, 0, compositeTable,
                                   sizeof(float) * TABLE_SIZE * 4,
                                   cudaMemcpyHostToDevice));

    memset(&resourceDesc, 0, sizeof(resourceDesc));
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = array;

    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.normalizedCoords = true;
    texDesc.readMode = cudaReadModeElementType;

    CudaSafeCall(cudaCreateTextureObject(&compositeTex, &resourceDesc, &texDesc, NULL));
}
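// The constructor above acquires a CUDA array and a texture object but never
// releases them. A minimal cleanup sketch (hypothetical destructor; assumes
// `array` and `compositeTex` are the members allocated in the constructor):
TransferFunction::~TransferFunction()
{
    if (compositeTex)
        CudaSafeCall(cudaDestroyTextureObject(compositeTex));
    if (array)
        CudaSafeCall(cudaFreeArray(array));
}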
__host__ inline TextureArray(const Vector2i& sizes)
    : _sizes{ sizes }
{
    auto channel_descriptor = ChannelFormatDescriptor<T>::type();
    SHAKTI_SAFE_CUDA_CALL(cudaMallocArray(&_array, &channel_descriptor,
                                          sizes(0), sizes(1)));
}
template<>
inline cudaArray* MallocArray2D<float2>(VolumeDescription volumeDescription)
{
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    MOJO_CUDA_SAFE(cudaMallocArray(&cuArray, &channelDesc,
                                   volumeDescription.numVoxels.x,
                                   volumeDescription.numVoxels.y));
    return cuArray;
}
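// A usage sketch for the helper above (hypothetical; `hostData` is assumed to
// be a tightly packed host buffer holding one float2 per voxel):
cudaArray* cuArray = MallocArray2D<float2>(volumeDescription);
MOJO_CUDA_SAFE(cudaMemcpy2DToArray(
    cuArray, 0, 0, hostData,
    volumeDescription.numVoxels.x * sizeof(float2),   // source pitch in bytes
    volumeDescription.numVoxels.x * sizeof(float2),   // row width in bytes
    volumeDescription.numVoxels.y,
    cudaMemcpyHostToDevice));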
static void addImageToTextureUint(vector<Mat_<uint8_t> > &imgs, cudaTextureObject_t texs[])
{
    for (unsigned int i = 0; i < imgs.size(); i++) {
        int rows = imgs[i].rows;
        int cols = imgs[i].cols;

        // Create channel with uint8_t point type; the data is unsigned,
        // so use an unsigned 8-bit channel descriptor.
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();

        // Allocate array with correct size and number of channels
        cudaArray *cuArray;
        checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, cols, rows));
        checkCudaErrors(cudaMemcpy2DToArray(cuArray, 0, 0,
                                            imgs[i].ptr<uint8_t>(), imgs[i].step[0],
                                            cols * sizeof(uint8_t), rows,
                                            cudaMemcpyHostToDevice));

        // Specify texture struct
        cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;

        // Specify texture object parameters
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0] = cudaAddressModeWrap;
        texDesc.addressMode[1] = cudaAddressModeWrap;
        texDesc.filterMode = cudaFilterModePoint;
        texDesc.readMode = cudaReadModeElementType;
        texDesc.normalizedCoords = 0;

        // Create texture object
        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
    }
}
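// A minimal device-side fetch sketch for textures created as above
// (hypothetical kernel; with cudaReadModeElementType, point filtering, and
// unnormalized coordinates, adding 0.5 addresses the texel center):
__global__ void sampleUint8(cudaTextureObject_t tex, uint8_t* out, int cols, int rows)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
        out[y * cols + x] = tex2D<uint8_t>(tex, x + 0.5f, y + 0.5f);
}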
cudaTextureObject_t create_environment_light_texture(const std::string& filename)
{
    int w = 0, h = 0, n = 0;
    float* data = stbi_loadf(filename.c_str(), &w, &h, &n, 0);
    if (!data) {
        std::cerr << "Unable to load environment map: " << filename << std::endl;
        exit(EXIT_FAILURE);
    }

    // create channel desc
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();

    // create cudaArray
    cudaArray* array;
    checkCudaErrors(cudaMallocArray(&array, &channelDesc, w, h));
    if (n == 3) {
        // Expand RGB to RGBA, since there is no 3-channel texel format.
        uint32_t count = w * h;
        std::vector<float4> ext_data;
        ext_data.reserve(count);
        for (uint32_t i = 0; i < count; ++i)
            ext_data.push_back(make_float4(data[i * 3], data[i * 3 + 1], data[i * 3 + 2], 0.f));
        checkCudaErrors(cudaMemcpyToArray(array, 0, 0, ext_data.data(),
                                          sizeof(float4) * w * h, cudaMemcpyHostToDevice));
    } else {
        // Assumes n == 4 here: the copy reads w * h float4 texels from `data`.
        checkCudaErrors(cudaMemcpyToArray(array, 0, 0, data,
                                          sizeof(float4) * w * h, cudaMemcpyHostToDevice));
    }
    stbi_image_free(data);

    // create resource desc
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = array;

    // create texture desc
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = true;

    // create cudaTextureObject
    cudaTextureObject_t tex;
    checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    return tex;
}
static void addImageToTextureFloatColor(vector<Mat> &imgs, cudaTextureObject_t texs[])
{
    for (size_t i = 0; i < imgs.size(); i++) {
        int rows = imgs[i].rows;
        int cols = imgs[i].cols;

        // Create channel with floating point type
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();

        // Allocate array with correct size and number of channels
        cudaArray *cuArray;
        checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, cols, rows));
        checkCudaErrors(cudaMemcpy2DToArray(cuArray, 0, 0,
                                            imgs[i].ptr<float>(), imgs[i].step[0],
                                            cols * sizeof(float) * 4, rows,
                                            cudaMemcpyHostToDevice));

        // Specify texture struct
        cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;

        // Specify texture object parameters
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0] = cudaAddressModeWrap;
        texDesc.addressMode[1] = cudaAddressModeWrap;
        texDesc.filterMode = cudaFilterModeLinear;
        texDesc.readMode = cudaReadModeElementType;
        texDesc.normalizedCoords = 0;

        // Create texture object
        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
    }
}
CudaFloatTexture1D::CudaFloatTexture1D(int width, const double *data, CudaAction action,
                                       cudaStream_t stream, CudaInternalAPI *api)
{
    channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    // Allocate the texture on the GPU...
    CUDA_SAFE_CALL(cudaMallocArray(&deviceArray, &channelDesc, width, 1));

    // ... and in page-locked system memory
    CUDA_SAFE_CALL(cudaMallocHost((void**)&hostMem, sizeof(float) * width));

    // Convert doubles to floats and save them to page-locked system memory
    std::transform(data, data + width, hostMem, typecast<float, double>);

    // Copy floats from the page-locked memory to the GPU
    CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(deviceArray, 0, 0, hostMem,
                                          sizeof(float) * width,
                                          cudaMemcpyHostToDevice, stream));

    if (action == BindToKernel)
        api->setDistDepDielTexture(deviceArray, &channelDesc);
}
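// Because the copy above is asynchronous, the page-locked staging buffer must
// outlive the copy on `stream`. A hypothetical matching destructor (assumes
// `deviceArray` and `hostMem` are the members allocated above and that the
// stream has been synchronized beforehand):
CudaFloatTexture1D::~CudaFloatTexture1D()
{
    CUDA_SAFE_CALL(cudaFreeArray(deviceArray));
    CUDA_SAFE_CALL(cudaFreeHost(hostMem));
}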
void costVol_chamo::updataCV(Mat refImg, Mat projMat, float weightPerImg)
{
    cudaArray* cuArray;
    cudaTextureObject_t texObj;

    // 8-bit RGBA texels (uchar4), read back as normalized floats below.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    cudaSafeCall(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // 4 bytes per uchar4 texel, i.e. width * height * 4 bytes in total.
    cudaMemcpyToArray(cuArray, 0, 0, refImg.data,
                      width * height * 4 * sizeof(unsigned char), cudaMemcpyHostToDevice);
    cudaSafeCall(cudaGetLastError());

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeNormalizedFloat;
    texDesc.normalizedCoords = 0;
    cudaSafeCall(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));

    Mat finalTran = projMat * baseImgProjMat.inv();

    cvInput input;
    input.baseImg = (float*)baseImg.data;
    input.cvData = (float*)cvData.data;
    input.nearZ = nearZ;
    input.farZ = farZ;
    input.height = height;
    input.width = width;
    input.lowInd = (float*)lowInd.data;
    input.lowValue = (float*)bottonVal.data;
    for (int i = 0; i < 12; i++) {
        input.transMat[i] = finalTran.at<float>(i);
    }
    input.refImg = texObj;
    input.zStep = (nearZ - farZ) / layers;
    input.stepCount = layers;
    updataCount++;
    input.weightPerImg = 1.0 / updataCount;
    updataCVCaller(input);
}
TEST(PointerGetAttributes, Array)
{
    struct cudaArray * ary;
    cudaError_t ret;

    struct cudaChannelFormatDesc dsc;
    dsc.x = dsc.y = dsc.z = dsc.w = 8;
    dsc.f = cudaChannelFormatKindSigned;

    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocArray(&ary, &dsc, 1, 1, 0);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaPointerAttributes attr;
    ret = cudaPointerGetAttributes(&attr, ary);
    EXPECT_EQ(cudaErrorInvalidValue, ret);

    ret = cudaFreeArray(ary);
    ASSERT_EQ(cudaSuccess, ret);
}
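// Companion sketch (hypothetical test, not part of the original suite): the
// same call succeeds for linear device memory from cudaMalloc, which is what
// makes the cudaErrorInvalidValue above the expected result for a cudaArray.
TEST(PointerGetAttributes, DevicePointer)
{
    void *ptr;
    ASSERT_EQ(cudaSuccess, cudaMalloc(&ptr, 4));

    struct cudaPointerAttributes attr;
    EXPECT_EQ(cudaSuccess, cudaPointerGetAttributes(&attr, ptr));

    ASSERT_EQ(cudaSuccess, cudaFree(ptr));
}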
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU;

    cudaArray *a_Src;
    cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>();

    float *d_Output;
    float gpuTime;
    StopWatchInterface *hTimer = NULL;

    const int imageW = 3072;
    const int imageH = 3072 / 2;
    const unsigned int iterations = 10;

    printf("[%s] - Starting...\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    h_Kernel    = (float *)malloc(KERNEL_LENGTH * sizeof(float));
    h_Input     = (float *)malloc(imageW * imageH * sizeof(float));
    h_Buffer    = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
    checkCudaErrors(cudaMallocArray(&a_Src, &floatTex, imageW, imageH));
    checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)));

    srand(2009);

    for (unsigned int i = 0; i < KERNEL_LENGTH; i++) {
        h_Kernel[i] = (float)(rand() % 16);
    }

    for (unsigned int i = 0; i < imageW * imageH; i++) {
        h_Input[i] = (float)(rand() % 16);
    }

    setConvolutionKernel(h_Kernel);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice));

    printf("Running GPU rows convolution (%u identical iterations)...\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (unsigned int i = 0; i < iterations; i++) {
        convolutionRowsGPU(d_Output, a_Src, imageW, imageH);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionRowsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    // While CUDA kernels can't write to textures directly, this copy is inevitable
    printf("Copying convolutionRowGPU() output back to the texture...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer);
    printf("cudaMemcpyToArray() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Running GPU columns convolution (%u iterations)\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (unsigned int i = 0; i < iterations; i++) {
        convolutionColumnsGPU(d_Output, a_Src, imageW, imageH);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionColumnsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Reading back GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Checking the results...\n");
    printf("...running convolutionRowsCPU()\n");
    convolutionRowsCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS);

    printf("...running convolutionColumnsCPU()\n");
    convolutionColumnsCPU(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS);

    double delta = 0;
    double sum = 0;

    for (unsigned int i = 0; i < imageW * imageH; i++) {
        sum += h_OutputCPU[i] * h_OutputCPU[i];
        delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
    }

    double L2norm = sqrt(delta / sum);
    printf("Relative L2 norm: %E\n", L2norm);
    printf("Shutting down...\n");

    checkCudaErrors(cudaFree(d_Output));
    checkCudaErrors(cudaFreeArray(a_Src));
    free(h_OutputGPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);
    sdkDeleteTimer(&hTimer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (L2norm > 1e-6) {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}
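// Note: cudaMemcpyToArray, used throughout this sample, is deprecated as of
// CUDA 10.1. A sketch of the equivalent host-to-array copy with
// cudaMemcpy2DToArray for the tightly packed h_Input buffer above:
checkCudaErrors(cudaMemcpy2DToArray(a_Src, 0, 0, h_Input,
                                    imageW * sizeof(float),  // source pitch in bytes
                                    imageW * sizeof(float),  // row width in bytes
                                    imageH, cudaMemcpyHostToDevice));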
ErrorCode GpuBinaryImageAlgorithm<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::initializeDevice()
{
    /*
     * Attempts to check the GPU and begin warm up
     */
    this->lastError = this->setGpuDevice();
    if (this->lastError)
        return this->lastError;

    if (this->properties.getMajorCompute() < 1) {
        this->lastError = InitFailNoCUDA;
        return this->lastError;
    }

    /*
     * Verifies the properties of the GPU
     */
    cudaStreamCreate(&this->stream);
    cudaError cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        this->lastError = InitFailcuStreamCreateErrorcudaErrorInvalidValue;
        return this->lastError;
    }

    // Set the descriptor of the input data before allocation; for a 16-bit
    // unsigned input this is a single-channel, 16-bit unsigned integer channel.
    std::string inTypeIdentifier(typeid(this->tempForTypeTesting).name());
    size_t bitDepth = 0;
    cudaChannelFormatDesc inputDescriptor;

    if (inTypeIdentifier == "a" || inTypeIdentifier == "s" || inTypeIdentifier == "i" || inTypeIdentifier == "l") {
        this->channelType = cudaChannelFormatKindSigned;
    }
    else if (inTypeIdentifier == "h" || inTypeIdentifier == "t" || inTypeIdentifier == "j" || inTypeIdentifier == "m") {
        this->channelType = cudaChannelFormatKindUnsigned;
    }
    else if (inTypeIdentifier == "f" || inTypeIdentifier == "d") {
        this->channelType = cudaChannelFormatKindFloat;
    }
    else {
        this->lastError = InitFailUnsupportedInputType;
        return this->lastError;
    }

    bitDepth = sizeof(this->tempForTypeTesting) * 8;
    inputDescriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, this->channelType);
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        this->lastError = CudaError;
        std::cout << "CUDA ERR = " << cuer << std::endl;
        throw std::runtime_error("GPU WHS INIT FAILED TO CREATE CHANNEL");
    }

    //////////////////////////////////////////////////////////
    // ALLOCATE MEMORY FOR GPU INPUT AND OUTPUT DATA (TILE) //
    //////////////////////////////////////////////////////////

    // Clear any stale error state before the allocations below.
    cuer = cudaGetLastError();

    /* GPU input data */
    cudaMallocArray((cudaArray**)&this->gpuInputDataArray, &inputDescriptor,
                    this->dataSize.width, this->dataSize.height);
    this->gpuInput = this->gpuInputDataArray;
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERR = " << cuer << std::endl;
        throw std::runtime_error("GPU WHS INIT FAILED TO ALLOCATE MEMORY");
    }
    this->usingTexture = true;

    /* GPU output data */
    const size_t bytes = this->dataSize.width * this->dataSize.height * OutputBandCount * sizeof(OutputPixelType);
    this->outputDataSize = bytes;
    cudaMalloc((void**)&this->gpuOutputData, bytes);
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        throw std::runtime_error("GPU WHS INIT FAILED TO ALLOCATE OUTPUT MEMORY");
    }
    if (cuer == cudaErrorMemoryAllocation) {
        this->lastError = InitFailcuOutArrayMemErrorcudaErrorMemoryAllocation;
        return this->lastError;
    }

    //////////////////////////////////////////////////////////////////////////////////////
    // CALL FUNCTION TO ALLOCATE ADDITIONAL GPU STORAGE - DOES NOTHING IF NOT OVERRIDDEN //
    //////////////////////////////////////////////////////////////////////////////////////

    /* Initialize the neighborhood coordinates */
    /* uses two ints to store width and height coords by the windowRadius_ */

    /*
     * Allocates the memory needed for the results coming back from the GPU
     */
    this->lastError = this->allocateAdditionalGpuMemory();
    return this->lastError;
}
ErrorCode GpuBinaryImageAlgorithm<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::operator()(
    const cvt::cvTile<InputPixelType>& tile, const cvt::cvTile<InputPixelType>& tile2,
    const cvt::cvTile<OutputPixelType>** outTile)
{
    // TO-DO Error Check Template Params for Type/Bounds
    const cv::Size2i tileSize = tile.getSize();

    if (tileSize != this->dataSize) {
        std::stringstream ss;
        ss << tileSize << " expected, got " << this->dataSize << std::endl;
        throw std::runtime_error(ss.str());
    }

    if (tileSize != tile2.getSize()) {
        throw std::runtime_error("Both of the incoming tiles must have the same size");
    }

    /*
     * Copy data down for tile using the parent's implementation
     */
    this->lastError = this->copyTileToDevice(tile);
    if (this->lastError != cvt::Ok) {
        throw std::runtime_error("Failed to copy tile to device");
    }

    std::string inTypeIdentifier(typeid(this->tempForTypeTesting).name());
    size_t bitDepth = 0;
    cudaChannelFormatDesc inputDescriptor;

    if (inTypeIdentifier == "a" || inTypeIdentifier == "s" || inTypeIdentifier == "i" || inTypeIdentifier == "l") {
        this->channelType = cudaChannelFormatKindSigned;
    }
    else if (inTypeIdentifier == "h" || inTypeIdentifier == "t" || inTypeIdentifier == "j" || inTypeIdentifier == "m") {
        this->channelType = cudaChannelFormatKindUnsigned;
    }
    else if (inTypeIdentifier == "f" || inTypeIdentifier == "d") {
        this->channelType = cudaChannelFormatKindFloat;
    }
    else {
        this->lastError = InitFailUnsupportedInputType;
        return this->lastError;
    }

    bitDepth = sizeof(this->tempForTypeTesting) * 8;
    cudaError cuer;
    inputDescriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, this->channelType);
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        this->lastError = CudaError;
        std::cout << "CUDA ERR = " << cuer << std::endl;
        throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO CREATE CHANNEL");
    }

    cudaChannelFormatDesc input_descriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaMallocArray((cudaArray**)&gpuInputDataArrayTwo_, &input_descriptor,
                    this->dataSize.width, this->dataSize.height);

    const unsigned int offsetX = 0;
    const unsigned int offsetY = 0;
    const unsigned char* tile_data_ptr = &(tile2[0].data[0]);
    const unsigned int tileArea = tile.getSize().area();

    cudaMemcpyToArrayAsync(gpuInputDataArrayTwo_,              // the device | destination address
                           offsetX, offsetY,                   // the destination offsets
                           tile_data_ptr,                      // the host | source address
                           sizeof(InputPixelType) * tileArea,  // the size of the copy in bytes
                           cudaMemcpyHostToDevice,             // the type of the copy
                           this->stream);                      // the device command stream
    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERR = " << cuer << std::endl;
        throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO ALLOCATE MEMORY");
    }

    // Invoke kernel with empirically chosen block size
    unsigned short bW = 16;
    unsigned short bH = 16;
    launchKernel(bW, bH);

    this->lastError = this->copyTileFromDevice(outTile);
    if (this->lastError != cvt::Ok) {
        throw std::runtime_error("Failed to copy tile off of device");
    }
    return Ok;
}
void CudaImagePyramidHost::initialize(int width, int height, cudaTextureFilterMode filter_mode, int depth)
{
    qDebug() << "pyramid host initializing with params: " << width << height << filter_mode << depth;

    if (isInitialized() && width == _baseWidth && height == _baseHeight && filter_mode == _filterMode) {
        return;
    }

    clear();
    qDebug() << "Clear done.";

    _baseWidth = width;
    _baseHeight = height;
    _filterMode = filter_mode;
    _numLayers = depth;

    // Get the texture and its channel descriptor to allocate the storage.
    const textureReference* constTexRefPtr = NULL;
    cudaGetTextureReference(&constTexRefPtr, _texture_name);
    qDebug() << "Texture Ref got:" << _name;
    if (constTexRefPtr == 0) {
        qDebug() << "constTexRefPtr==0";
    }
    checkCUDAError("Can't get tex ref for init TEXTURE_PYRAMID", _name);
    cudaChannelFormatDesc formatDesc = constTexRefPtr->channelDesc;

    if (_textureType == cudaTextureType2DLayered) {
        cudaDeviceProp prop;
        qDebug() << "to get CUDA device prop";
        cudaGetDeviceProperties(&prop, 0);
        qDebug() << "CUDA Device Prop got";
        if (prop.maxTexture2DLayered[0] < _baseWidth ||
            prop.maxTexture2DLayered[1] < _baseHeight ||
            prop.maxTexture2DLayered[2] < _numLayers) {
            qDebug() << "Max layered texture size:" << prop.maxTexture2DLayered[0] << " x "
                     << prop.maxTexture2DLayered[1] << " x " << prop.maxTexture2DLayered[2];
            assert(0);
        }
        cudaExtent extent = make_cudaExtent(_baseWidth, _baseHeight, _numLayers);
        cudaMalloc3DArray(&_storage, &formatDesc, extent, cudaArrayLayered);
    } else {
        cudaMallocArray(&_storage, &formatDesc, _baseWidth, _baseHeight);
    }
    checkCUDAError("Failure to allocate", _name);
    qDebug() << "allocate done";

    // Set texture parameters.
    // Evil hack to get around an apparent bug in the cuda api:
    // cudaGetTextureReference only returns a const reference, and
    // there is no way to set the parameters with a reference other
    // than cast it to non-const.
    textureReference* texRefPtr = const_cast<textureReference*>(constTexRefPtr);
    texRefPtr->addressMode[0] = cudaAddressModeClamp;
    texRefPtr->addressMode[1] = cudaAddressModeClamp;
    texRefPtr->filterMode = filter_mode;
    texRefPtr->normalized = false; // Use unnormalized (pixel) coordinates for addressing. This forbids texture mode wrap.

    bindTexture();
    qDebug() << "texture bound";

    bool found = false;
    for (size_t i = 0; i < _instances.size(); i++) {
        if (_instances[i] == this)
            found = true;
    }
    if (!found) {
        qDebug() << "Not found";
        _instances.push_back(this);
    }
    qDebug() << "pyramid host initialized.";
}
cudaError_t WINAPI wine_cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
                                        size_t width, size_t height, unsigned int flags)
{
    WINE_TRACE("\n");
    return cudaMallocArray(array, desc, width, height, flags);
}
cudaError_t WINAPI wine_cudaMallocArray(struct cudaArray** array, const struct cudaChannelFormatDesc* desc,
                                        size_t width, size_t height)
{
    WINE_TRACE("\n");
    return cudaMallocArray(array, desc, width, height);
}
void InitCudaLayers()
{
    mmGridSizeX = sim_width / blockSizex;
    mmGridSizeY = sim_height / blockSizey;
    mmGridSize = mmGridSizeX * mmGridSizeY;
    memset(mmGrid, 0, sizeof(mmGrid));
    memset(mmYGGrid, 0, sizeof(mmYGGrid));

    tempHostData = (float*)malloc(sim_width * sim_height * TEMP_HOST_ELEM * sizeof(float));
    tempHostDataNoCuda = (float*)malloc(sim_width * sim_height * TEMP_HOST_ELEM * sizeof(float));
    grid8ValTick = (float*)malloc(sim_width * sim_height * 8 * sizeof(float));
    initColors();

    memset(gCudaLayer, 0, sizeof(gCudaLayer));
    memset(gCudaFuncLayer, 0, sizeof(gCudaFuncLayer));
    memset(gPhysLayer, 0, sizeof(gPhysLayer));
    memset(gStateLayer, 0, sizeof(gStateLayer));

    srand(0);
    int seed = rand();

    const cudaChannelFormatDesc desc4 = cudaCreateChannelDesc<float4>();
    cudaMallocArray(&gCudaVectArray, &desc4, sim_width, sim_height);

#if NFLAYERS == 2
    const cudaChannelFormatDesc descF = cudaCreateChannelDesc<float2>();
#elif NFLAYERS == 4
    const cudaChannelFormatDesc descF = desc4;
#endif
    cudaMallocArray(&gCudaFlArray, &descF, sim_width, sim_height);

    const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaMallocArray(&gCudaFuncWavePack, &desc, sim_width);
    cudaMallocArray(&gCudaFuncSmooth, &desc, sim_width);
    cudaMallocArray(&(gCudaLayer[0]), &desc, sim_width, sim_height);
    cudaMallocArray(&(gCudaLayer[1]), &desc, sim_width, sim_height);
    cudaMallocArray(&(gCudaFuncLayer[0]), &desc, sim_width, sim_height);

    cudaMalloc(&cuTempData, TEMP_SIZE * sizeof(float) * sim_width * sim_height);
    cudaMalloc(&cuRandArr, sizeof(unsigned int) * sim_width * sim_height);

    cudaMalloc(&gStateLayer[0], sim_rect * sizeof(float));
    cudaMemset(gStateLayer[0], 0, sim_rect * sizeof(float));
    cudaMalloc(&gStateLayer[1], sim_rect * sizeof(float));
    cudaMemset(gStateLayer[1], 0, sim_rect * sizeof(float));

    cudaMalloc(&gPhysLayer[0], sim_rect * sizeof(float));
    cudaMemset(gPhysLayer[0], 0, sim_rect * sizeof(float));
    cudaMalloc(&gPhysLayer[1], sim_rect * sizeof(float));
    cudaMemset(gPhysLayer[1], 0, sim_rect * sizeof(float));

    cudaMalloc(&gRedBlueField, NFLAYERS * sim_rect * sizeof(float));
    cudaMemset(gRedBlueField, 0, NFLAYERS * sim_rect * sizeof(float));

    size_t pitch = 4 * sim_width * sizeof(float);
    cudaMallocPitch((void**)&gVectorLayer, &pitch, 4 * sim_width * sizeof(float), sim_height);
    cudaMemset2D(gVectorLayer, 4 * sim_width * sizeof(float), 0, 4 * sim_width * sizeof(float), sim_height);

    InitWavePack(32, 1.f, sim_width, sim_height, cuTempData, gCudaFuncWavePack);
    InitSmooth(1, sim_width, cuTempData, gCudaFuncSmooth);
    InitRnd2DInt(seed, cuRandArr, sim_width, sim_height);
    InitFuncLayer(gCudaFuncLayer[0], cuTempData, sim_width, sim_height);
    InitPhysLayer(gPhysLayer[0], gStateLayer[0], cuRandArr, sim_width, sim_height);

    float* gridIni = cuTempData + 3 * sim_rect / 2;
    float* halfTemp = cuTempData + sim_rect;
    float* out = cuTempData + 2 * sim_rect;
    cudaMemset(out, 0, sim_rect * sizeof(float));

    seed = rand();
    int gridx = INTERP_SIZEX;
    int gridy = INTERP_SIZEX;
    InitRnd2DF(seed, gridIni, gridx, gridy);
    float scaleadd = .7f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

    seed = rand();
    gridx = (int)(gridx * 2);
    gridy = (int)(gridy * 2);
    InitRnd2DF(seed, gridIni, gridx, gridy);
    scaleadd = .3f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);
    cudaMemcpyToArray(gCudaLayer[0], 0, 0, out, sizeof(float) * sim_rect, cudaMemcpyDeviceToDevice);

    cudaMemset(out, 0, sim_rect * sizeof(float));
    gridx = INTERP_SIZEX;
    gridy = INTERP_SIZEX;
    seed = rand();
    InitRnd2DF(seed, gridIni, gridx, gridy);
    scaleadd = .7f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

    seed = rand();
    gridx = (int)(gridx * 1.5);
    gridy = (int)(gridy * 1.5);
    InitRnd2DF(seed, gridIni, gridx, gridy);
    scaleadd = .3f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);
    cudaMemcpyToArray(gCudaLayer[1], 0, 0, out, sizeof(float) * sim_rect, cudaMemcpyDeviceToDevice);

    float2 pos0;
    pos0.x = gObj0X;
    pos0.y = gObj0Y;
    float2 pos1;
    pos1.x = gObj1X;
    pos1.y = gObj1Y;
    gObjInertia.Init(pos0, pos1);

    LayerProc(sim_width, sim_height, gCudaLayer[0], gCudaFuncLayer[0], cuTempData,
              pos0.x, pos0.y, pos1.x, pos1.y);
    ParticleStateInit(cuTempData, cuRandArr, gStateLayer[0], gPhysLayer[0], gRedBlueField);
    InitBhv();
}