C++ (Cpp) cudaMallocArray 예제들

예제 #1

0

파일 보기

파일: transferfunction.cpp 프로젝트: sunwj/SunVRVR

TransferFunction::TransferFunction(vtkSmartPointer<vtkPiecewiseFunction> otf, vtkSmartPointer<vtkColorTransferFunction> ctf, QObject *parent) : QObject(parent)
{
    opacityTF = otf;
    colorTF = ctf;

    this->otf = QSharedPointer<ctkTransferFunction>(new ctkVTKPiecewiseFunction(opacityTF));
    this->ctf = QSharedPointer<ctkTransferFunction>(new ctkVTKColorTransferFunction(colorTF));

    connect(this->otf.data(), SIGNAL(changed()), this, SLOT(onOpacityTFChanged()));
    connect(this->ctf.data(), SIGNAL(changed()), this, SLOT(onColorTFChanged()));

    compositeTex = 0;

    // initialize each table
    opacityTF->GetTable(0.0, 1.0, TABLE_SIZE, opacityTable);
    colorTF->GetTable(0.0, 1.0, TABLE_SIZE, colorTable);
    CompositeTable();

    channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
    CudaSafeCall(cudaMallocArray(&array, &channelDesc, TABLE_SIZE));
    CudaSafeCall(cudaMemcpyToArray(array, 0, 0, compositeTable, sizeof(float) * TABLE_SIZE * 4, cudaMemcpyHostToDevice));

    memset(&resourceDesc, 0, sizeof(resourceDesc));
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = array;

    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.normalizedCoords = true;
    texDesc.readMode = cudaReadModeElementType;

    CudaSafeCall(cudaCreateTextureObject(&compositeTex, &resourceDesc, &texDesc, NULL));
}

예제 #2

0

파일 보기

파일: TextureArray.hpp 프로젝트: DO-CV/shakti

 __host__
 inline TextureArray(const Vector2i& sizes)
   : _sizes{ sizes }
 {
   auto channel_descriptor = ChannelFormatDescriptor<T>::type();
   SHAKTI_SAFE_CUDA_CALL(cudaMallocArray(
     &_array, &channel_descriptor, sizes(0), sizes(1)));
 }

예제 #3

0

파일 보기

파일: Cuda.hpp 프로젝트: mikeroberts3000/NeuralProcessReconstructionFromSparseUserScribbles

inline cudaArray* MallocArray2D< float2 >( VolumeDescription volumeDescription )
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc( 32, 32, 0, 0, cudaChannelFormatKindFloat ); 

    cudaArray* cuArray; 
    MOJO_CUDA_SAFE( cudaMallocArray( &cuArray, &channelDesc, volumeDescription.numVoxels.x, volumeDescription.numVoxels.y ) ); 

    return cuArray;
}

예제 #4

0

파일 보기

파일: main.cpp 프로젝트: chuong/fusibile

static void addImageToTextureUint (vector<Mat_<uint8_t> > &imgs, cudaTextureObject_t texs[])
{
    for (unsigned int i=0; i<imgs.size(); i++)
    {
        int rows = imgs[i].rows;
        int cols = imgs[i].cols;
        // Create channel with uint8_t point type
        cudaChannelFormatDesc channelDesc =
        //cudaCreateChannelDesc (8,
        //0,
        //0,
        //0,
        //cudaChannelFormatKindUnsigned);
        cudaCreateChannelDesc<char>();
        // Allocate array with correct size and number of channels
        cudaArray *cuArray;
        checkCudaErrors(cudaMallocArray(&cuArray,
                                        &channelDesc,
                                        cols,
                                        rows));

        checkCudaErrors (cudaMemcpy2DToArray (cuArray,
                                              0,
                                              0,
                                              imgs[i].ptr<uint8_t>(),
                                              imgs[i].step[0],
                                              cols*sizeof(uint8_t),
                                              rows,
                                              cudaMemcpyHostToDevice));

        // Specify texture
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType         = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;

        // Specify texture object parameters
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModePoint;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 0;

        // Create texture object
        //cudaTextureObject_t &texObj = texs[i];
        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
        //texs[i] = texObj;
    }
    return;
}

예제 #5

0

파일 보기

파일: utils.cpp 프로젝트: sunwj/SunPathTracer-CUDA-Path-tracer

cudaTextureObject_t create_environment_light_texture(const std::string& filename)
{
    int w = 0, h = 0, n = 0;
    float* data = stbi_loadf(filename.c_str(), &w, &h, &n, 0);
    if(!data)
    {
        std::cerr<<"Unable to load environment map: "<<filename<<std::endl;
        exit(0);
    }

    //create channel desc
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
    //create cudaArray
    cudaArray* array;
    checkCudaErrors(cudaMallocArray(&array, &channelDesc, w, h));
    if(n == 3)
    {
        uint32_t count = w * h;
        std::vector<float4> ext_data;
        ext_data.reserve(count);
        for(auto i = 0; i < count; ++i)
            ext_data.push_back(make_float4(data[i * 3], data[i * 3 + 1], data[i * 3 + 2], 0.f));

        checkCudaErrors(cudaMemcpyToArray(array, 0, 0, ext_data.data(), sizeof(float4) * w * h, cudaMemcpyHostToDevice));
    }
    else
        checkCudaErrors(cudaMemcpyToArray(array, 0, 0, data, sizeof(float4) * w * h, cudaMemcpyHostToDevice));
    //create resource desc
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = array;
    //create texture desc
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = true;
    //create cudaTextureObject
    cudaTextureObject_t tex;
    checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));

    return tex;
}

예제 #6

0

파일 보기

파일: main.cpp 프로젝트: chuong/fusibile

static void addImageToTextureFloatColor (vector<Mat > &imgs, cudaTextureObject_t texs[])
{
    for (size_t i=0; i<imgs.size(); i++)
    {
        int rows = imgs[i].rows;
        int cols = imgs[i].cols;
        // Create channel with floating point type
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();

        // Allocate array with correct size and number of channels
        cudaArray *cuArray;
        checkCudaErrors(cudaMallocArray(&cuArray,
                                        &channelDesc,
                                        cols,
                                        rows));

        checkCudaErrors (cudaMemcpy2DToArray (cuArray,
                                              0,
                                              0,
                                              imgs[i].ptr<float>(),
                                              imgs[i].step[0],
                                              cols*sizeof(float)*4,
                                              rows,
                                              cudaMemcpyHostToDevice));

        // Specify texture
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType         = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;

        // Specify texture object parameters
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 0;

        // Create texture object
        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
    }
    return;
}

예제 #7

0

파일 보기

파일: CudaFloatTexture1D.cpp 프로젝트: marekolsak/fastgrid

CudaFloatTexture1D::CudaFloatTexture1D(int width, const double *data, CudaAction action, cudaStream_t stream, CudaInternalAPI *api)
{
    channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    // Allocate the texture on the GPU...
    CUDA_SAFE_CALL(cudaMallocArray(&deviceArray, &channelDesc, width, 1));
    // ... and in page-locked system memory
    CUDA_SAFE_CALL(cudaMallocHost((void**)&hostMem, sizeof(float) * width));

    // Convert doubles to floats and save them to page-locked system memory
    std::transform(data, data + width, hostMem, typecast<float, double>);

    // Copy floats from the page-locked memory to the GPU
    CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(deviceArray, 0, 0, hostMem, sizeof(float) * width, cudaMemcpyHostToDevice, stream));

    if (action == BindToKernel)
        api->setDistDepDielTexture(deviceArray, &channelDesc);
}

예제 #8

0

파일 보기

파일: costVol_chamo.cpp 프로젝트: ziliwangmoe/DTAM

void costVol_chamo::updataCV(Mat refImg, Mat projMat, float weightPerImg){
	
	
	cudaArray* cuArray;
	cudaTextureObject_t texObj;
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
	cudaSafeCall(cudaMallocArray(&cuArray, &channelDesc, width, height));

	cudaMemcpyToArray(cuArray, 0, 0, refImg.data, width*height*sizeof(float), cudaMemcpyHostToDevice);
	cudaSafeCall(cudaGetLastError());

	struct cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(resDesc));
	resDesc.resType = cudaResourceTypeArray;
	resDesc.res.array.array = cuArray;
	struct cudaTextureDesc texDesc;
	memset(&texDesc, 0, sizeof(texDesc));
	texDesc.addressMode[0] = cudaAddressModeClamp;
	texDesc.addressMode[1] = cudaAddressModeClamp;
	texDesc.filterMode = cudaFilterModeLinear;
	texDesc.readMode = cudaReadModeNormalizedFloat;
	texDesc.normalizedCoords = 0;
	cudaSafeCall(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));

	Mat finalTran = projMat*baseImgProjMat.inv();
	cvInput input;
	input.baseImg = (float *)baseImg.data;
	input.cvData = (float*)cvData.data;
	input.nearZ = nearZ;
	input.farZ = farZ;
	input.height = height;
	input.width = width;
	input.lowInd = (float*)lowInd.data;
	input.lowValue = (float*)bottonVal.data;
	for (int i = 0; i < 12; i++){
		input.transMat[i] = finalTran.at<float>(i);
	}
	input.refImg = texObj;
	input.zStep = (nearZ - farZ) / layers;
	input.stepCount = layers;
	updataCount++;
	input.weightPerImg = 1.0 / updataCount;
	updataCVCaller(input);
}

예제 #9

0

파일 보기

파일: vtest_pointergetattributes.cpp 프로젝트: ckennelly/panoptes

TEST(PointerGetAttributes, Array) {
    struct cudaArray * ary;
    cudaError_t ret;

    struct cudaChannelFormatDesc dsc;
    dsc.x = dsc.y = dsc.z = dsc.w = 8;
    dsc.f = cudaChannelFormatKindSigned;

    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocArray(&ary, &dsc, 1, 1, 0);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaPointerAttributes attr;
    ret = cudaPointerGetAttributes(&attr, ary);
    EXPECT_EQ(cudaErrorInvalidValue, ret);

    ret = cudaFreeArray(ary);
    ASSERT_EQ(cudaSuccess, ret);
}

예제 #10

0

파일 보기

파일: main.cpp 프로젝트: chengli1986/571e

////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    float
    *h_Kernel,
    *h_Input,
    *h_Buffer,
    *h_OutputCPU,
    *h_OutputGPU;

    cudaArray
    *a_Src;

    cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>();

    float
    *d_Output;

    float
    gpuTime;

    StopWatchInterface *hTimer = NULL;

    const int imageW = 3072;
    const int imageH = 3072 / 2;
    const unsigned int iterations = 10;

    printf("[%s] - Starting...\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    h_Kernel    = (float *)malloc(KERNEL_LENGTH * sizeof(float));
    h_Input     = (float *)malloc(imageW * imageH * sizeof(float));
    h_Buffer    = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
    checkCudaErrors(cudaMallocArray(&a_Src, &floatTex, imageW, imageH));
    checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)));

    srand(2009);

    for (unsigned int i = 0; i < KERNEL_LENGTH; i++)
    {
        h_Kernel[i] = (float)(rand() % 16);
    }

    for (unsigned int i = 0; i < imageW * imageH; i++)
    {
        h_Input[i] = (float)(rand() % 16);
    }

    setConvolutionKernel(h_Kernel);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice));


    printf("Running GPU rows convolution (%u identical iterations)...\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (unsigned int i = 0; i < iterations; i++)
    {
        convolutionRowsGPU(
            d_Output,
            a_Src,
            imageW,
            imageH
        );
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionRowsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    //While CUDA kernels can't write to textures directly, this copy is inevitable
    printf("Copying convolutionRowGPU() output back to the texture...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer);
    printf("cudaMemcpyToArray() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Running GPU columns convolution (%i iterations)\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (int i = 0; i < iterations; i++)
    {
        convolutionColumnsGPU(
            d_Output,
            a_Src,
            imageW,
            imageH
        );
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionColumnsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Reading back GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Checking the results...\n");
    printf("...running convolutionRowsCPU()\n");
    convolutionRowsCPU(
        h_Buffer,
        h_Input,
        h_Kernel,
        imageW,
        imageH,
        KERNEL_RADIUS
    );

    printf("...running convolutionColumnsCPU()\n");
    convolutionColumnsCPU(
        h_OutputCPU,
        h_Buffer,
        h_Kernel,
        imageW,
        imageH,
        KERNEL_RADIUS
    );

    double delta = 0;
    double sum = 0;

    for (unsigned int i = 0; i < imageW * imageH; i++)
    {
        sum += h_OutputCPU[i] * h_OutputCPU[i];
        delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
    }

    double L2norm = sqrt(delta / sum);
    printf("Relative L2 norm: %E\n", L2norm);
    printf("Shutting down...\n");

    checkCudaErrors(cudaFree(d_Output));
    checkCudaErrors(cudaFreeArray(a_Src));
    free(h_OutputGPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);

    sdkDeleteTimer(&hTimer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (L2norm > 1e-6)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}

예제 #11

0

파일 보기

파일: GpuBinaryImageAlgorithm.hpp 프로젝트: samkreter/cvtile

ErrorCode GpuBinaryImageAlgorithm<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::initializeDevice()
{
	/*
	 * Attempts to check the GPU and begin warm up
	 *
	 */
	this->lastError = this->setGpuDevice();
	if(this->lastError)
		return this->lastError;
	
	if (this->properties.getMajorCompute() < 1)
	{
		this->lastError = InitFailNoCUDA;
		return this->lastError;
	}
	/*
	 * Verfies the properities of GPU
	 *
	 */
	cudaStreamCreate(&this->stream);
	cudaError cuer = cudaGetLastError();
	if(cuer != cudaSuccess){
		this->lastError = InitFailcuStreamCreateErrorcudaErrorInvalidValue;
		return this->lastError;
	}

	//Set descriptor of input data before allocation
	// Sets is at single channel, 16-bit unsigned integer
	std::string inTypeIdentifier(typeid(this->tempForTypeTesting).name());
	size_t bitDepth = 0;
	cudaChannelFormatDesc inputDescriptor;

	if(inTypeIdentifier == "a" || 
	   inTypeIdentifier == "s" || 
	   inTypeIdentifier == "i" ||
	   inTypeIdentifier == "l")
	{
		this->channelType = cudaChannelFormatKindSigned;
	}
	else if(inTypeIdentifier == "h" || 
			inTypeIdentifier == "t" || 
			inTypeIdentifier == "j" || 
			inTypeIdentifier == "m")
	{
		this->channelType = cudaChannelFormatKindUnsigned;
	}
	else if(inTypeIdentifier == "f" || 
			inTypeIdentifier == "d") 
	{
		this->channelType = cudaChannelFormatKindFloat;
	}
	else
	{
		this->lastError = InitFailUnsupportedInputType;
		return this->lastError;
	}

	bitDepth = sizeof(this->tempForTypeTesting) * 8;

	inputDescriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, this->channelType);
	cuer = cudaGetLastError();
	
	if (cuer != cudaSuccess) {
		this->lastError = CudaError;
		std::cout << "CUDA ERR = " << cuer << std::endl;
		throw std::runtime_error("GPU WHS INIT FAILED TO CREATE CHANNEL");
	}	


	if(cuer != cudaSuccess){
		this->lastError = CudaError;
		return this->lastError;
	}	
	
	//////////////////////////////////////////////////////////
	// ALLOCATE MEMORY FOR GPU INPUT AND OUTPUT DATA (TILE) //
	/////////////////////////////////////////////////////////
	
	cuer = cudaGetLastError();
	/*Gpu Input Data*/
	cudaMallocArray(
					(cudaArray**)&this->gpuInputDataArray,   
					 &inputDescriptor, 
					 this->dataSize.width,  
					 this->dataSize.height
					);
	this->gpuInput = this->gpuInputDataArray;

	cuer = cudaGetLastError();
	if (cuer != cudaSuccess) {
		std::cout << "CUDA ERR = " << cuer << std::endl;
		throw std::runtime_error("GPU WHS INIT FAILED TO ALLOCATE MEMORY");
	}
	this->usingTexture = true;	
	//Gpu Output Data 
	const size_t bytes = this->dataSize.width * this->dataSize.height * OutputBandCount * sizeof(OutputPixelType);
	this->outputDataSize = bytes;
	cudaMalloc((void**) &this->gpuOutputData, bytes);
	cuer = cudaGetLastError();

	if (cuer != cudaSuccess) {
		throw new std::runtime_error("GPU WHS INIT FAILED TO ALLOCATE OUTPUT MEMORY");
	}
	if (cuer == cudaErrorMemoryAllocation)
	{
		this->lastError = InitFailcuOutArrayMemErrorcudaErrorMemoryAllocation;
		return this->lastError;
	}
	//////////////////////////////////////////////////////////////////////////////////////
	// CALL FUNCTION TO ALLOCATE ADDITIONAL GPU STORAGE - DOES NOTHING IF NOT OVERRIDEN //
	/////////////////////////////////////////////////////////////////////////////////////
	/* Initialize the neighborhood coordinates */
	/*uses two ints to store width and height coords by the windowRadius_*/	
	/*
	 * Allocates the memory needed for the results coming back from the GPU
	 *
	 */
	this->lastError = this->allocateAdditionalGpuMemory();
	return this->lastError;
}

예제 #12

0

파일 보기

파일: GpuBinaryImageAlgorithm.hpp 프로젝트: samkreter/cvtile

ErrorCode GpuBinaryImageAlgorithm<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::operator()(const cvt::cvTile<InputPixelType>& tile,
					const cvt::cvTile<InputPixelType> &tile2,const cvt::cvTile<OutputPixelType> ** outTile)
{
		//TO-DO Error Check Template Params for Type/Bounds

	const cv::Size2i tileSize = tile.getSize();
	
	if (tileSize != this->dataSize)
	{
		std::stringstream ss;
		ss << tileSize << " expected of " << this->dataSize << std::endl; 
		throw std::runtime_error(ss.str());
	}

	if (tileSize != tile2.getSize()) {
		throw std::runtime_error("Both the incoming tiles must have different sizes");
	}

	/*
	 *  Copy data down for tile using the parents implementation
	 */
	this->lastError = this->copyTileToDevice(tile);
	if (this->lastError != cvt::Ok)
	{
		throw std::runtime_error("Failed to copy tile to device");
	}
	std::string inTypeIdentifier(typeid(this->tempForTypeTesting).name());
	size_t bitDepth = 0;
	cudaChannelFormatDesc inputDescriptor;

	if(inTypeIdentifier == "a" || 
	   inTypeIdentifier == "s" || 
	   inTypeIdentifier == "i" ||
	   inTypeIdentifier == "l")
	{
		this->channelType = cudaChannelFormatKindSigned;
	}
	else if(inTypeIdentifier == "h" || 
			inTypeIdentifier == "t" || 
			inTypeIdentifier == "j" || 
			inTypeIdentifier == "m")
	{
		this->channelType = cudaChannelFormatKindUnsigned;
	}
	else if(inTypeIdentifier == "f" || 
			inTypeIdentifier == "d") 
	{
		this->channelType = cudaChannelFormatKindFloat;
	}
	else
	{
		this->lastError = InitFailUnsupportedInputType;
		return this->lastError;
	}

	bitDepth = sizeof(this->tempForTypeTesting) * 8;

	cudaError cuer;
	inputDescriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, this->channelType);
	cuer = cudaGetLastError();
	
	if (cuer != cudaSuccess) {
		this->lastError = CudaError;
		std::cout << "CUDA ERR = " << cuer << std::endl;
		throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO CREATE CHANNEL");
	}

	cudaChannelFormatDesc input_descriptor = cudaCreateChannelDesc(bitDepth, 0, 0, 0, cudaChannelFormatKindUnsigned);
	cudaMallocArray((cudaArray**)&gpuInputDataArrayTwo_, &input_descriptor, this->dataSize.width, this->dataSize.height);
	const unsigned int offsetX = 0;
	const unsigned int offsetY = 0;
	const unsigned char* tile_data_ptr = &(tile2[0].data[0]);
	const unsigned int tileArea = tile.getSize().area();

	cudaMemcpyToArrayAsync(gpuInputDataArrayTwo_,	// the device | destination address
			offsetX , offsetY,  			// the destination offsets
			tile_data_ptr,					// the host | source address
			sizeof(InputPixelType) * tileArea,		// the size of the copy in bytes
			cudaMemcpyHostToDevice,			// the type of the copy
			this->stream);						// the device command stream

	cuer = cudaGetLastError();
	if (cuer != cudaSuccess) {
		std::cout << "CUDA ERR = " << cuer << std::endl;
		throw std::runtime_error("GPU BINARY IMAGE RUN FAILED TO ALLOCATE MEMORY");
	}

	// Invoke kernel with empirically chosen block size
	unsigned short bW = 16;
	unsigned short bH = 16;

	launchKernel(bW, bH);

	this->lastError = this->copyTileFromDevice(outTile);
	if(this->lastError != cvt::Ok) {
		std::runtime_error("Failed copy off tile from device");
	}
	return Ok;
}

예제 #13

0

파일 보기

파일: cudaImagePyramid.cpp 프로젝트: Epsirom/Stylization

void CudaImagePyramidHost::initialize(int width, int height, cudaTextureFilterMode filter_mode, int depth)
{
    qDebug() << "pyramid host initializing with params: " << width << height << filter_mode << depth;
    if (isInitialized() && width == _baseWidth && height == _baseHeight && filter_mode == _filterMode) {
        return;
    }

    clear();
    qDebug() << "Clear done.";

    _baseWidth = width;
    _baseHeight = height;
    _filterMode = filter_mode;
    _numLayers = depth;

    // Get the texture and its channel descriptor to allocate the storage.
    const textureReference* constTexRefPtr=NULL;
    cudaGetTextureReference(&constTexRefPtr, _texture_name);
    qDebug() << "Texture Ref got:" << _name;
    if (constTexRefPtr == 0)
    {
        qDebug() << "constTexRefPtr==0";
    }
    checkCUDAError("Can't get tex ref for init TEXTURE_PYRAMID", _name);
    cudaChannelFormatDesc formatDesc = constTexRefPtr->channelDesc;

    if(_textureType == cudaTextureType2DLayered){
        cudaDeviceProp prop;
        qDebug() << "to get CUDA device prop";
        cudaGetDeviceProperties(&prop,0);
        qDebug() << "CUDA Device Prop got";
        if(prop.maxTexture2DLayered[0] < _baseWidth || prop.maxTexture2DLayered[1] < _baseHeight || prop.maxTexture2DLayered[2] < _numLayers){
            qDebug()<< "Max layered texture size:" << prop.maxTexture2DLayered[0] << " x " << prop.maxTexture2DLayered[1] << " x " << prop.maxTexture2DLayered[2];
            assert(0);
        }
        cudaExtent extent = make_cudaExtent(_baseWidth, _baseHeight, _numLayers);
        cudaMalloc3DArray(&_storage, &formatDesc, extent, cudaArrayLayered);
    }else{
        cudaMallocArray(&_storage, &formatDesc, _baseWidth, _baseHeight);
    }
    checkCUDAError("Failure to allocate", _name);

    qDebug() << "allocate done";

    // Set texture parameters.
    // Evil hack to get around an apparent bug in the cuda api:
    // cudaGetTextureReference only returns a const reference, and
    // there is no way to set the parameters with a reference other
    // than cast it to non-const.
    textureReference* texRefPtr=NULL;
    texRefPtr = const_cast<textureReference*>( constTexRefPtr );
    texRefPtr->addressMode[0] = cudaAddressModeClamp;
    texRefPtr->addressMode[1] = cudaAddressModeClamp;
    texRefPtr->filterMode = filter_mode;
    texRefPtr->normalized = false; // Use unnormalized (pixel) coordinates for addressing. This forbids texture mode wrap.

    bindTexture();

    qDebug() << "texture binded";

    bool found = false;
    for (size_t i = 0; i < _instances.size(); i++) {
        if (_instances[i] == this)
            found = true;
    }
    if (!found) {
        qDebug() << "Not found";
        _instances.push_back(this);
    }
    qDebug() << "paramid host initialized.";
}

예제 #14

0

파일 보기

파일: cudart.c 프로젝트: Shelnutt2/cuda-wine-wrapper

cudaError_t WINAPI wine_cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags) {
    WINE_TRACE("\n");
    return cudaMallocArray( array, desc, width, height, flags);
}

예제 #15

0

파일 보기

파일: cudart.c 프로젝트: sebastic/cuda-wine-wrapper

cudaError_t WINAPI wine_cudaMallocArray( struct cudaArray** array, const struct cudaChannelFormatDesc* desc, size_t width, size_t height ){
	WINE_TRACE("\n");
	return cudaMallocArray( array, desc, width, height);
}

예제 #16

0

파일 보기

파일: layer.cpp 프로젝트: s271/cuMagic

void InitCudaLayers()
{

	mmGridSizeX = sim_width/blockSizex;
	mmGridSizeY = sim_height/blockSizey;
	mmGridSize = mmGridSizeX*mmGridSizeY;
	memset(mmGrid, 0, sizeof(mmGrid));
	memset(mmYGGrid, 0, sizeof(mmYGGrid));

	tempHostData = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float));
	tempHostDataNoCuda = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float));
	grid8ValTick = (float*)malloc(sim_width*sim_height*8*sizeof(float));

	initColors();

	memset(gCudaLayer, 0, sizeof(gCudaLayer));
	memset(gCudaFuncLayer, 0, sizeof(gCudaFuncLayer));
	memset(gPhysLayer, 0, sizeof(gPhysLayer));
	memset(gStateLayer, 0, sizeof(gStateLayer));

	srand(0);
	int seed = rand();

	const cudaChannelFormatDesc desc4 = cudaCreateChannelDesc<float4>();
	cudaMallocArray(&gCudaVectArray, &desc4, sim_width, sim_height);
#if NFLAYERS ==2
	const cudaChannelFormatDesc desc2 = cudaCreateChannelDesc<float2>();
#else if NFLAYERS ==4
	const cudaChannelFormatDesc descF = desc4;
#endif
	cudaMallocArray(&gCudaFlArray, &descF, sim_width, sim_height);

	const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
	cudaMallocArray(&gCudaFuncWavePack, &desc, sim_width);
	cudaMallocArray(&gCudaFuncSmooth, &desc, sim_width);

	cudaMallocArray(&(gCudaLayer[0]), &desc, sim_width, sim_height);
	cudaMallocArray(&(gCudaLayer[1]), &desc, sim_width, sim_height);

	cudaMallocArray(&(gCudaFuncLayer[0]), &desc, sim_width, sim_height);

	cudaMalloc(&cuTempData, TEMP_SIZE*sizeof(float)*sim_width*sim_height);
	cudaMalloc(&cuRandArr, sizeof(unsigned int)*sim_width*sim_height);

	cudaMalloc(&gStateLayer[0], sim_rect*sizeof(float));
	cudaMemset(gStateLayer[0], 0, sim_rect*sizeof(float));
	cudaMalloc(&gStateLayer[1], sim_rect*sizeof(float));
	cudaMemset(gStateLayer[1], 0, sim_rect*sizeof(float));

	cudaMalloc(&gPhysLayer[0], sim_rect*sizeof(float));
	cudaMemset(gPhysLayer[0], 0, sim_rect*sizeof(float));
	cudaMalloc(&gPhysLayer[1], sim_rect*sizeof(float));
	cudaMemset(gPhysLayer[1], 0, sim_rect*sizeof(float));

	cudaMalloc(&gRedBlueField, NFLAYERS*sim_rect*sizeof(float));
	cudaMemset(gRedBlueField, 0, NFLAYERS*sim_rect*sizeof(float));

	size_t pitch = 4*sim_width*sizeof(float);
	cudaMallocPitch((void**)&gVectorLayer, &pitch, 4*sim_width*sizeof(float), sim_height);

	cudaMemset2D(gVectorLayer, 4*sim_width*sizeof(float), 0, 4*sim_width*sizeof(float), sim_height);	

	InitWavePack(32, 1.f, sim_width, sim_height, cuTempData, gCudaFuncWavePack);

	InitSmooth(1, sim_width, cuTempData, gCudaFuncSmooth);

	InitRnd2DInt(seed, cuRandArr, sim_width, sim_height);

	InitFuncLayer(gCudaFuncLayer[0], cuTempData, sim_width, sim_height);

	InitPhysLayer(gPhysLayer[0], gStateLayer[0], cuRandArr, sim_width, sim_height);

	float* gridIni = cuTempData+3*sim_rect/2;
	float* halfTemp = cuTempData + sim_rect;
	float* out = cuTempData + 2*sim_rect;
	cudaMemset(out, 0, sim_rect*sizeof(float));

	seed = rand();
	int gridx = INTERP_SIZEX;
	int gridy = INTERP_SIZEX;
	InitRnd2DF(seed, gridIni, gridx, gridy);
	float scaleadd = .7f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

	seed = rand();
	gridx = (int)(gridx*2);
	gridy = (int)(gridy*2);
	InitRnd2DF(seed, gridIni, gridx, gridy);
	scaleadd = .3f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);


	cudaMemcpyToArray(gCudaLayer[0], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice);

	cudaMemset(out, 0, sim_rect*sizeof(float));
	gridx = INTERP_SIZEX;
	gridy = INTERP_SIZEX;

	seed = rand();
	InitRnd2DF(seed, gridIni, gridx, gridy);
	scaleadd = .7f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

	seed = rand();
	gridx = (int)(gridx*1.5);
	gridy = (int)(gridy*1.5);
	InitRnd2DF(seed, gridIni, gridx, gridy);
	scaleadd = .3f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

	cudaMemcpyToArray(gCudaLayer[1], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice);

	float2 pos0;
	pos0.x = gObj0X;
	pos0.y = gObj0Y;

	float2 pos1;
	pos1.x = gObj1X;
	pos1.y = gObj1Y;

	gObjInertia.Init(pos0, pos1);

	LayerProc(sim_width, sim_height, gCudaLayer[0], gCudaFuncLayer[0], cuTempData, pos0.x , pos0.y, pos1.x , pos1.y);
	ParticleStateInit(cuTempData, cuRandArr, 
					   gStateLayer[0], gPhysLayer[0], gRedBlueField);

	InitBhv();

}