Example #1
0
CUresult loadCUDAModules()
{
    CUmodule cuModule_;
    checkCudaErrors(cuModuleLoad(&cuModule_, "videoPP64.ptx"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelNV12toARGB, cuModule_, "NV12ToARGBdrvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBtoNV12, cuModule_, "ARGBToNv12drvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBpostprocess, cuModule_, "ARGBpostprocess"));
}
int main(int argc, char** argv)
{

	float fTotalTime = 0;

// 	int TARGET_WIDTH=atoi(argv[2]);
// 	int TARGET_HEIGHT=atoi(argv[3]);
// 	bool visualize_results=atoi(argv[4]);
// 	unsigned int kernel_size=atoi(argv[2]);
 	int gpuNr=atoi(argv[2]);
 	checkCudaErrors(cudaSetDevice(gpuNr));

	
	IplImage* gray_image = cvLoadImage(argv[1],CV_LOAD_IMAGE_GRAYSCALE);
	unsigned char * d_input_image;
	unsigned char * d_output_image;
	int widthImage=gray_image->width;
	int heightImage=gray_image->height;
	
	IplImage *output_image = cvCreateImage(cvSize(widthImage,heightImage), IPL_DEPTH_8U, 1);
	for( int i=0;i<heightImage;i++)
	  for( int j=0;j<widthImage;j++)
	    output_image->imageData[i*widthImage+j]=255;
	
	  	  unsigned int * d_histogram;
	int total_threads=256;	  
	  cudaMalloc(&d_histogram,sizeof(unsigned int)*256*total_threads);
	checkCudaErrors(cudaMalloc(&d_input_image,widthImage*heightImage*sizeof(unsigned char)));  
	checkCudaErrors(cudaMalloc(&d_output_image,widthImage*heightImage*sizeof(unsigned char)));
	checkCudaErrors(cudaMemcpy(d_input_image,gray_image->imageData,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyHostToDevice));
	unsigned int windows_array[4]={15,17,25,31};
	int total_implementations=4;
	double elapsed_time;
	for (int i=1;i<=total_implementations;i++)
	{
	  for( int j=0;j<4;j++)
	  {
	timer my_timer;  
	MedianFilterUcharCUDA(d_input_image,d_output_image,d_histogram,widthImage,heightImage,windows_array[j],16,16,i);
	cudaThreadSynchronize();
	elapsed_time=my_timer.elapsed();
	printf("elapsed_time for implementation %d for window size %d was %f \n",i,windows_array[j],elapsed_time);
	  }
	}
	timer array_timer;
	arrayFireRows(d_input_image,d_output_image,widthImage,heightImage,3,16,16);
		cudaThreadSynchronize();
	elapsed_time=array_timer.elapsed();
	printf("elapsed_time for array fire was %f \n",elapsed_time);
	checkCudaErrors(cudaMemcpy(output_image->imageData,d_output_image,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyDeviceToHost));
// 	_medianfilter((unsigned char *)gray_image->imageData, (unsigned char *)output_image->imageData, widthImage, heightImage);
	
	cvSaveImage("output.jpg",output_image);
	



}
Example #3
0
void mmf::OptSO3MMFvMF::init()
{
  std::cout << "mmf::OptSO3MMFvMF::init()" << std::endl;
  std::cout << 3*6*K() << std::endl;
  checkCudaErrors(cudaMalloc((void **)&d_cost, K()*6*sizeof(float)));
  checkCudaErrors(cudaMalloc((void **)&d_mu_, K()*6*3*sizeof(float)));
  checkCudaErrors(cudaMalloc((void **)&d_N_, sizeof(int)));
  loadRGBvaluesForMFaxes();
};
/*
 * This function load the ptx file ptxPath and extract the kernel kName
 * to phKernel
 * @param phKernel    Output kernel handle
 * @param ptxPath     ptx file name
 * @param kName       kernel name
 */
void ptxJIT(CUmodule *phModule, CUfunction *phKernel, const char *ptxPath, const char *kName)
{
  CUlinkState cuLinkState;
  CUjit_option options[6];
  void *optionVals[6];
  float walltime;
  char error_log[8192], info_log[8192];
  unsigned int logSize = 8192;
  void *cuOut;
  size_t outSize;
  int myErr = 0;

  // Setup linker options
  // Return walltime from JIT compilation
  options[0] = CU_JIT_WALL_TIME;
  optionVals[0] = (void *) &walltime;
  // Pass a buffer for info messages
  options[1] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[1] = (void *) info_log;
  // Pass the size of the info buffer
  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[2] = (void *) (long)logSize;
  // Pass a buffer for error message
  options[3] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[3] = (void *) error_log;
  // Pass the size of the error buffer
  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[4] = (void *) (long) logSize;
  // Make the linker verbose
  options[5] = CU_JIT_LOG_VERBOSE;
  optionVals[5] = (void *) 1;

  // Create a pending linker invocation
  checkCudaErrors(cuLinkCreate(6,options, optionVals, &cuLinkState));

  // Load the ptx from the file
  myErr = cuLinkAddFile(cuLinkState, CU_JIT_INPUT_PTX, ptxPath, 0, 0, 0);
  if (myErr != CUDA_SUCCESS){
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
    fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
  }

  // Complete the linker step
  checkCudaErrors(cuLinkComplete(cuLinkState, &cuOut, &outSize));

  // Linker walltime and info_log were requested in options above.
  printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime, info_log);

  // Load resulting cuBin into module
  checkCudaErrors(cuModuleLoadData(phModule, cuOut));

  // Locate the kernel entry point
  checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, kName));

  // Destroy the linker invocation
  checkCudaErrors(cuLinkDestroy(cuLinkState));
}
Example #5
0
void displayFunc(void)
{
    sdkStartTimer(&timer);
    TColor *d_dst = NULL;
    size_t num_bytes;

    if (frameCounter++ == 0)
    {
        sdkResetTimer(&timer);
    }

    // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO));
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    getLastCudaError("cudaGraphicsMapResources failed");
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource));
    getLastCudaError("cudaGraphicsResourceGetMappedPointer failed");

    checkCudaErrors(CUDA_Bind2TextureArray());

    runImageFilters(d_dst);

    checkCudaErrors(CUDA_UnbindTexture());
    // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO));
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    // Common display code path
    {
        glClear(GL_COLOR_BUFFER_BIT);

        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0));
        glBegin(GL_TRIANGLES);
        glTexCoord2f(0, 0);
        glVertex2f(-1, -1);
        glTexCoord2f(2, 0);
        glVertex2f(+3, -1);
        glTexCoord2f(0, 2);
        glVertex2f(-1, +3);
        glEnd();
        glFinish();
    }

    if (frameCounter == frameN)
    {
        frameCounter = 0;

        if (g_FPS)
        {
            printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001));
            g_FPS = false;
        }
    }

    glutSwapBuffers();

    sdkStopTimer(&timer);
    computeFPS();
}
Example #6
0
void Mandelbrot::WriteBuffer() {

	checkCudaErrors( cudaGLMapBufferObject( ( void** ) &this->devArray, this->buffer ), __LINE__, false );

	cudaMemcpy( this->devArray, this->devCalcArray, this->iSize, cudaMemcpyDeviceToDevice );

	checkCudaErrors( cudaGLUnmapBufferObject( this->buffer ), __LINE__, false );

	this->bIsFlushed = true;
}
Example #7
0
BaseData<Dtype>::~BaseData()
{
	if (cpu_data_ != NULL){
		checkCudaErrors(cudaFreeHost(cpu_data_));
	}

	if (gpu_data_ != NULL){
		checkCudaErrors(cudaFree(gpu_data_));
	}
}
Example #8
0
void Renderer::render(const Camera& camera, float time) {

	// calc cam vars
  glm::vec3 A,B,C;
  {
    // camera ray
    C = glm::normalize(camera.getLookAt()-camera.getPosition());

    // calc A (screen x)
    // calc B (screen y) then scale down relative to aspect
    // fov is for screen x axis
    A = glm::normalize(glm::cross(C,camera.getUp()));
    B = 1.0f/camera.getAspect()*glm::normalize(glm::cross(A,C));

    // scale by FOV
    float tanFOV = tan(glm::radians(camera.getFOV()));
    A *= tanFOV;
    B *= tanFOV;
  }

  // cuda call
  unsigned int* out_data;
	checkCudaErrors(cudaGLMapBufferObject((void**)&out_data, pbo));
  
  if (mode == RAYTRACE) {
    raytrace1(out_data, image_width, image_height, time,
      camera.getPosition(), A, B, C,
      scene_d, sceneSize);
  }
  else if (mode == PATHTRACE) {
    ++filmIters;

    pathtrace(out_data, image_width, image_height, time,
      camera.getPosition(), A, B, C,
      camera.m_lensRadius, camera.m_focalDist,
      scene_d, sceneSize,
      rand_d, rays_d, col_d, idx_d,
      film_d, filmIters);
  }

	checkCudaErrors(cudaGLUnmapBufferObject(pbo));

	// download texture from destination PBO
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
  glActiveTexture(GL_TEXTURE0 + RENDER_TEXTURE);
	glBindTexture(GL_TEXTURE_2D, result_texture);
	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
  glActiveTexture(GL_TEXTURE0 + UNUSED_TEXTURE);

	SDK_CHECK_ERROR_GL();

  
  fullScreenQuad.display();
}
Example #9
0
void Canvas::paintGL()
{
    glClear(GL_COLOR_BUFFER_BIT);

    if(!ready) return;

    size_t size;
    checkCudaErrors(cudaGraphicsMapResources(1, &resource, 0));
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void**)&img, &size, resource));

    if(renderMode == RENDER_MODE_RAYCASTING)
    {
        /*glEnable(GL_BLEND);
        glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);

        glBegin(GL_QUADS);
        glColor4f(0.4745f, 0.9294f, 0.8901f, 1.f);
        glVertex2f(1.f, 1.f);

        glColor4f(0.4745f, 0.9294f, 0.8901f, 1.f);
        glVertex2f(-1.f, 1.f);

        glColor4f(0.9490f, 0.9647f, 0.9803f, 1.f);
        glVertex2f(-1.f, -1.f);

        glColor4f(0.9490f, 0.9647f, 0.9803f, 1.f);
        glVertex2f(1.f, -1.f);
        glEnd();*/

        render_raycasting(img, deviceVolume, transferFunction, camera, volumeReader.GetElementBoundingSphereRadius());
    }
    else
    {
        render_pathtracer(img, renderParams);
        if(renderParams.frameNo == 0)
        {
            char* data = new char[WIDTH * HEIGHT * 4];
            cudaMemcpy(data, img, sizeof(glm::u8vec4) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);

            stbi_write_tga("0.tga", WIDTH, HEIGHT, 4, data);
            delete []data;
        }
    }
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaGraphicsUnmapResources(1, &resource, 0));

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
    glDrawPixels(WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    glDisable(GL_BLEND);

    renderParams.frameNo++;
}
Example #10
0
// display results using OpenGL
void display()
{
    sdkStartTimer(&timer);

    // execute filter, writing results to pbo
    unsigned int *dResult;

    //DEPRECATED: checkCudaErrors( cudaGLMapBufferObject((void**)&d_result, pbo) );
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dResult, &num_bytes, cuda_pbo_resource));
    bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

    // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(pbo));
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    // Common display code path
    {
        glClear(GL_COLOR_BUFFER_BIT);

        // load texture from pbo
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
        glBindTexture(GL_TEXTURE_2D, texid);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

        // fragment program is required to display floating point texture
        glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader);
        glEnable(GL_FRAGMENT_PROGRAM_ARB);
        glDisable(GL_DEPTH_TEST);

        glBegin(GL_QUADS);
        {
            glTexCoord2f(0, 0);
            glVertex2f(0, 0);
            glTexCoord2f(1, 0);
            glVertex2f(1, 0);
            glTexCoord2f(1, 1);
            glVertex2f(1, 1);
            glTexCoord2f(0, 1);
            glVertex2f(0, 1);
        }
        glEnd();
        glBindTexture(GL_TEXTURE_TYPE, 0);
        glDisable(GL_FRAGMENT_PROGRAM_ARB);
    }

    glutSwapBuffers();
    glutReportErrors();

    sdkStopTimer(&timer);

    computeFPS();
}
Example #11
0
// YV12/IYUV are both 4:2:0 planar formats (12bpc)
// Luma, U, V chroma planar (12bpc), chroma is subsampled (w/2,h/2)
void
VideoEncoder::CopyYV12orIYUVFrame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // Source is YV12/IYUV, this native format is converted to NV12 format by the video encoder
    // (1) luma copy setup
    CUDA_MEMCPY2D stCopyLuma;
    memset((void *)&stCopyLuma, 0, sizeof(stCopyLuma));
    stCopyLuma.srcXInBytes          = 0;
    stCopyLuma.srcY                 = 0;
    stCopyLuma.srcMemoryType        = CU_MEMORYTYPE_HOST;
    stCopyLuma.srcHost              = sFrameParams.picBuf;
    stCopyLuma.srcDevice            = 0;
    stCopyLuma.srcArray             = 0;
    stCopyLuma.srcPitch             = sFrameParams.Width;

    stCopyLuma.dstXInBytes          = 0;
    stCopyLuma.dstY                 = 0;
    stCopyLuma.dstMemoryType        = CU_MEMORYTYPE_DEVICE;
    stCopyLuma.dstHost              = 0;
    stCopyLuma.dstDevice            = dptr_VideoFrame;
    stCopyLuma.dstArray             = 0;
    stCopyLuma.dstPitch             = m_pEncoderParams->nDeviceMemPitch;

    stCopyLuma.WidthInBytes         = m_pEncoderParams->iInputSize[0];
    stCopyLuma.Height               = m_pEncoderParams->iInputSize[1];

    // (2) chroma copy setup, U/V can be done together
    CUDA_MEMCPY2D stCopyChroma;
    memset((void *)&stCopyChroma, 0, sizeof(stCopyChroma));
    stCopyChroma.srcXInBytes        = 0;
    stCopyChroma.srcY               = m_pEncoderParams->iInputSize[1]<<1; // U/V chroma offset
    stCopyChroma.srcMemoryType      = CU_MEMORYTYPE_HOST;
    stCopyChroma.srcHost            = sFrameParams.picBuf;
    stCopyChroma.srcDevice          = 0;
    stCopyChroma.srcArray           = 0;
    stCopyChroma.srcPitch           = sFrameParams.Width>>1; // chroma is subsampled by 2 (but it has U/V are next to each other)

    stCopyChroma.dstXInBytes        = 0;
    stCopyChroma.dstY               = m_pEncoderParams->iInputSize[1]<<1; // chroma offset (srcY*srcPitch now points to the chroma planes)
    stCopyChroma.dstMemoryType      = CU_MEMORYTYPE_DEVICE;
    stCopyChroma.dstHost            = 0;
    stCopyChroma.dstDevice          = dptr_VideoFrame;
    stCopyChroma.dstArray           = 0;
    stCopyChroma.dstPitch           = m_pEncoderParams->nDeviceMemPitch>>1;

    stCopyChroma.WidthInBytes       = m_pEncoderParams->iInputSize[0]>>1;
    stCopyChroma.Height             = m_pEncoderParams->iInputSize[1]; // U/V are sent together

    // Don't forget we need to lock/unlock between memcopies
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&stCopyLuma));       // Now DMA Luma
    checkCudaErrors(cuMemcpy2D(&stCopyChroma));     // Now DMA Chroma channels (UV side by side)
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
void initCuda(bool useRGBA)
{
    // allocate device memory
    checkCudaErrors(cudaMalloc((void **) &d_img, (width * height * sizeof(unsigned int))));
    checkCudaErrors(cudaMalloc((void **) &d_temp, (width * height * sizeof(unsigned int))));

    // Refer to boxFilter_kernel.cu for implementation
    initTexture(width, height, h_img, useRGBA);

    sdkCreateTimer(&timer);
    sdkCreateTimer(&kernel_timer);
}
void
CudaInterface::initialize() {
    mDevID = 0;
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    checkCudaErrors(cudaGetDeviceProperties(&mDeviceProperty, mDevID));
    checkCudaErrors(cublasCreate(&mCublasHandle));
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", mDevID, mDeviceProperty.name, mDeviceProperty.major, mDeviceProperty.minor);

    // needs a larger block size for Fermi and above
    int block_size = (mDeviceProperty.major < 2) ? 16 : 32;
}
Example #14
0
void RunCuda(struct cudaGraphicsResource **resource)
{
    // map OpenGL buffer object for writing from CUDA
    checkCudaErrors(cudaGraphicsMapResources(1, resource, 0), exit(0));
    float4 *devPtr;
    size_t size;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&devPtr, &size, *resource), exit(0));
    //printf("CUDA mapped VBO: May access %ld bytes\n", size);
    launch_kernel(devPtr, MeshWidth, MeshHeight, _anim);
    // unmap buffer object
    checkCudaErrors(cudaGraphicsUnmapResources(1, resource, 0), exit(0));
}
Example #15
0
void BodySystemGPU<T>::setArray(BodyArray array, const T *data)
{
    assert(m_bInitialized);

    m_currentRead = 0;
    m_currentWrite = 1;

    switch (array)
    {
        default:
        case BODYSYSTEM_POSITION:
        {
            if (m_bUsePBO)
            {
                glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]);
                glBufferSubData(GL_ARRAY_BUFFER, 0, 4 * sizeof(T) * m_numBodies, data);

                int size = 0;
                glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);

                if ((unsigned)size != 4 * (sizeof(T) * m_numBodies))
                {
                    fprintf(stderr, "WARNING: Pixel Buffer Object download failed!n");
                }

                glBindBuffer(GL_ARRAY_BUFFER, 0);
            }
            else
            {
                if (m_bUseSysMem)
                {
                    memcpy(m_hPos[m_currentRead], data, m_numBodies * 4 * sizeof(T));
                }
                else
                    checkCudaErrors(cudaMemcpy(m_deviceData[0].dPos[m_currentRead], data,
                                               m_numBodies * 4 * sizeof(T),
                                               cudaMemcpyHostToDevice));
            }
        }
            break;

        case BODYSYSTEM_VELOCITY:
            if (m_bUseSysMem)
            {
                memcpy(m_hVel, data, m_numBodies * 4 * sizeof(T));
            }
            else
                checkCudaErrors(cudaMemcpy(m_deviceData[0].dVel, data, m_numBodies * 4 * sizeof(T),
                                           cudaMemcpyHostToDevice));

            break;
    }
}
void initCudaBuffers()
{
    unsigned int size = width * height * sizeof(unsigned int);

    // allocate device memory
    checkCudaErrors(cudaMalloc((void **) &d_img, size));
    checkCudaErrors(cudaMalloc((void **) &d_temp, size));

    checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice));

    sdkCreateTimer(&timer);
}
void
CudaInterface::fillParamMem(ParamMem_t& pmem, int byteVal) {
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    std::cout << "  setting " << pmem.totalSize * sizeof(float) << " bytes to " <<  pmem.base << "\n";
    if (pmem.device) {
        checkCudaErrors(cudaThreadSynchronize());
        checkCudaErrors(cudaMemset(pmem.base, byteVal, pmem.totalSize * sizeof(float)));
        checkCudaErrors(cudaThreadSynchronize());
    } else
        memset(pmem.base, byteVal, pmem.totalSize * sizeof(float));
}
Example #18
0
static void addImageToTextureUint (vector<Mat_<uint8_t> > &imgs, cudaTextureObject_t texs[])
{
    for (unsigned int i=0; i<imgs.size(); i++)
    {
        int rows = imgs[i].rows;
        int cols = imgs[i].cols;
        // Create channel with uint8_t point type
        cudaChannelFormatDesc channelDesc =
        //cudaCreateChannelDesc (8,
        //0,
        //0,
        //0,
        //cudaChannelFormatKindUnsigned);
        cudaCreateChannelDesc<char>();
        // Allocate array with correct size and number of channels
        cudaArray *cuArray;
        checkCudaErrors(cudaMallocArray(&cuArray,
                                        &channelDesc,
                                        cols,
                                        rows));

        checkCudaErrors (cudaMemcpy2DToArray (cuArray,
                                              0,
                                              0,
                                              imgs[i].ptr<uint8_t>(),
                                              imgs[i].step[0],
                                              cols*sizeof(uint8_t),
                                              rows,
                                              cudaMemcpyHostToDevice));

        // Specify texture
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType         = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;

        // Specify texture object parameters
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModePoint;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 0;

        // Create texture object
        //cudaTextureObject_t &texObj = texs[i];
        checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL));
        //texs[i] = texObj;
    }
    return;
}
Example #19
0
T* BodySystemGPU<T>::getArray(BodyArray array)
{
    assert(m_bInitialized);

    T *hdata = 0;
    T *ddata = 0;

    cudaGraphicsResource *pgres = nullptr;

    int currentReadHost = m_bUseSysMem ? m_currentRead : 0;

    switch (array)
    {
        default:
        case BODYSYSTEM_POSITION:
            hdata = m_hPos[currentReadHost];
            ddata = m_deviceData[0].dPos[m_currentRead];

            if (m_bUsePBO)
            {
                pgres = m_pGRes[m_currentRead];
            }

            break;

        case BODYSYSTEM_VELOCITY:
            hdata = m_hVel;
            ddata = m_deviceData[0].dVel;
            break;
    }

    if (!m_bUseSysMem)
    {
        if (pgres)
        {
            checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres, cudaGraphicsMapFlagsReadOnly));
            checkCudaErrors(cudaGraphicsMapResources(1, &pgres, 0));
            size_t bytes;
            checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&ddata, &bytes, pgres));
        }

        checkCudaErrors(cudaMemcpy(hdata, ddata,
                                   m_numBodies*4*sizeof(T), cudaMemcpyDeviceToHost));

        if (pgres)
        {
            checkCudaErrors(cudaGraphicsUnmapResources(1, &pgres, 0));
        }
    }

    return hdata;
}
Example #20
0
GpuCompilationContext::GpuCompilationContext(const void* image,
                                             const std::string& kernel_name,
                                             const int device_id,
                                             const void* cuda_mgr,
                                             unsigned int num_options,
                                             CUjit_option* options,
                                             void** option_vals)
    : module_(nullptr), kernel_(nullptr), device_id_(device_id), cuda_mgr_(cuda_mgr) {
  static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_)->setContext(device_id_);
  checkCudaErrors(cuModuleLoadDataEx(&module_, image, num_options, options, option_vals));
  CHECK(module_);
  checkCudaErrors(cuModuleGetFunction(&kernel_, module_, kernel_name.c_str()));
}
Example #21
0
void setupSizeResource()
{
    deleteImage(img);
    free(img_content);
    checkCudaErrors(cuMemFree(d_img_content));

    item_size = width * height * 4;

    img = createImage(width, height);
    img_content = (unsigned char*)malloc(item_size);
    checkCudaErrors(cuMemAlloc(&d_img_content, item_size));
    checkCudaErrors(cuMemcpyHtoD(d_fragColor, &d_img_content, d_fragColor_bytes));
}
Example #22
0
void finalize()
{
    if (!useCpu)
    {
        checkCudaErrors(cudaEventDestroy(startEvent));
        checkCudaErrors(cudaEventDestroy(stopEvent));
        checkCudaErrors(cudaEventDestroy(hostMemSyncEvent));
    }
    
    NBodyDemo<float>::Destroy();

    if (bSupportDouble) NBodyDemo<double>::Destroy();
}
void solve_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * b, cublasHandle_t handle)
{
	double * d_b;
	checkCudaErrors(cudaMalloc(&d_b, gpu_matrix.order*sizeof(double)));
	checkCublasErrors(cublasSetVector(gpu_matrix.order, sizeof(double), b, 1, d_b, 1));
	
	solve_lower_system_on_gpu(gpu_matrix, d_b, handle);
	solve_upper_system_on_gpu(gpu_matrix, d_b, handle);

	checkCublasErrors(cublasGetVector(gpu_matrix.order, sizeof(double), d_b, 1, b, 1));

	checkCudaErrors(cudaFree(d_b));
}
Example #24
0
// This test specifies a single test (where you specify radius and/or iterations)
int runSingleTest(char *ref_file, char *exec_path)
{
    int nTotalErrors = 0;
    char dump_file[256];

    printf("[runSingleTest]: [%s]\n", sSDKsample);

    initCuda();

    unsigned int *dResult;
    unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int));
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));

    // run the sample radius
    {
        printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations);
        bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

        // check if kernel execution generated an error
        getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
        checkCudaErrors(cudaDeviceSynchronize());

        // readback the results to system memory
        cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch,
                     sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost);

        sprintf(dump_file, "nature_%02d.ppm", filter_radius);

        sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height);

        if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false))
        {
            printf("Image is Different ");
            nTotalErrors++;
        }
        else
        {
            printf("Image is Matching ");
        }

        printf(" <%s>\n", ref_file);
    }
    printf("\n");

    free(hResult);
    checkCudaErrors(cudaFree(dResult));

    return nTotalErrors;
}
void
CudaInterface::freeParamMem(ParamMem_t& pmem) {
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    if (pmem.device)
        checkCudaErrors(cudaFree(pmem.base));
    else
        free(pmem.base);
    pmem.base = NULL;
    pmem.softmax = NULL;
    pmem.transformW = NULL;
    pmem.transformV = NULL;
    pmem.wordVecs = NULL;
}
Example #26
0
void BodySystemGPU<T>::setSoftening(T softening)
{
    T softeningSq = softening*softening;

    for (unsigned int i = 0; i < m_numDevices; i++)
    {
        if (m_numDevices > 1)
        {
            checkCudaErrors(cudaSetDevice(i));
        }

        checkCudaErrors(setSofteningSquared(softeningSq));
    }
}
// This is the normal display path
void display(void)
{
    sdkStartTimer(&timer);

    // Sobel operation
    Pixel *data = NULL;

    // map PBO to get CUDA device pointer
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes,
                                                         cuda_pbo_resource));
    //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

    sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale);
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    glClear(GL_COLOR_BUFFER_BIT);

    glBindTexture(GL_TEXTURE_2D, texid);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight,
                    GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0));
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    glDisable(GL_DEPTH_TEST);
    glEnable(GL_TEXTURE_2D);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);

    glBegin(GL_QUADS);
    glVertex2f(0, 0);
    glTexCoord2f(0, 0);
    glVertex2f(0, 1);
    glTexCoord2f(1, 0);
    glVertex2f(1, 1);
    glTexCoord2f(1, 1);
    glVertex2f(1, 0);
    glTexCoord2f(0, 1);
    glEnd();
    glBindTexture(GL_TEXTURE_2D, 0);
    glutSwapBuffers();

    sdkStopTimer(&timer);

    computeFPS();
}
void initGLBuffers()
{
    if (pbo)
    {
        // delete old buffer
        checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
        glDeleteBuffersARB(1, &pbo);
    }

    // create pixel buffer object for display
    glGenBuffersARB(1, &pbo);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(uchar4), 0, GL_STREAM_DRAW_ARB);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo,
                    cudaGraphicsMapFlagsWriteDiscard));

#if USE_BUFFER_TEX

    // create buffer texture, attach to pbo
    if (bufferTex)
    {
        glDeleteTextures(1, &bufferTex);
    }

    glGenTextures(1, &bufferTex);
    glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex);
    glTexBufferEXT(GL_TEXTURE_BUFFER_EXT, GL_RGBA8, pbo);
    glBindTexture(GL_TEXTURE_BUFFER_EXT, 0);
#else

    // create texture for display
    if (displayTex)
    {
        glDeleteTextures(1, &displayTex);
    }

    glGenTextures(1, &displayTex);
    glBindTexture(GL_TEXTURE_TYPE, displayTex);
    glTexImage2D(GL_TEXTURE_TYPE, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
    glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glBindTexture(GL_TEXTURE_TYPE, 0);
#endif

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));
}
Example #29
0
int main(int argc, char **argv) {
  uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
  uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
  unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

  float *h_filter;
  int    filterWidth;

  std::string input_file;
  std::string output_file;
  if (argc == 3) {
    input_file  = std::string(argv[1]);
    output_file = std::string(argv[2]);
  }
  else {
    std::cerr << "Usage: ./hw input_file output_file" << std::endl;
    exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
             &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
             &h_filter, &filterWidth, input_file);

  allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
                     d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
  timer.Stop();
  cudaDeviceSynchronize(); //checkCudaErrors(cudaGetLastError());
  int err = printf("%f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  cleanup();
  //check results and output the blurred image
  postProcess(output_file);

  checkCudaErrors(cudaFree(d_redBlurred));
  checkCudaErrors(cudaFree(d_greenBlurred));
  checkCudaErrors(cudaFree(d_blueBlurred));

  return 0;
}
Example #30
0
        void _runBenchmark(int iterations)
        {
            // once without timing to prime the device
            if (!useCpu)
            {
                m_nbody->update(activeParams.m_timestep);
            }

            if (useCpu)
            {
                sdkCreateTimer(&timer);
                sdkStartTimer(&timer);
            }
            else
            {
                checkCudaErrors(cudaEventRecord(startEvent, 0));
            }

            for (int i = 0; i < iterations; ++i)
            {
                m_nbody->update(activeParams.m_timestep);
            }

            float milliseconds = 0;

            if (useCpu)
            {
                sdkStopTimer(&timer);
                milliseconds = sdkGetTimerValue(&timer);
                sdkStartTimer(&timer);
            }
            else
            {
                checkCudaErrors(cudaEventRecord(stopEvent, 0));
                checkCudaErrors(cudaEventSynchronize(stopEvent));
                checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
            }

            double interactionsPerSecond = 0;
            double gflops = 0;
            computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);

            printf("%d bodies, total time for %d iterations: %.3f ms, mean %f\n",
                   numBodies, iterations, milliseconds, milliseconds/iterations);
            printf("= %.3f billion interactions per second\n", interactionsPerSecond);
            printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops,
                   (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction);
        }