/**
 * Loads the PTX module "videoPP64.ptx" and resolves the three kernels
 * "NV12ToARGBdrvapi", "ARGBToNv12drvapi" and "ARGBpostprocess" into the
 * global function handles g_kernelNV12toARGB, g_kernelARGBtoNV12 and
 * g_kernelARGBpostprocess.
 *
 * Every driver call is wrapped in checkCudaErrors (defined elsewhere).
 *
 * @return CUDA_SUCCESS when control reaches the end of the function.
 */
CUresult loadCUDAModules()
{
    CUmodule cuModule_;

    checkCudaErrors(cuModuleLoad(&cuModule_, "videoPP64.ptx"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelNV12toARGB, cuModule_, "NV12ToARGBdrvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBtoNV12, cuModule_, "ARGBToNv12drvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBpostprocess, cuModule_, "ARGBpostprocess"));

    // The original fell off the end of a function declared to return CUresult,
    // which is undefined behaviour; report success explicitly.
    return CUDA_SUCCESS;
}
int main(int argc, char** argv) { float fTotalTime = 0; // int TARGET_WIDTH=atoi(argv[2]); // int TARGET_HEIGHT=atoi(argv[3]); // bool visualize_results=atoi(argv[4]); // unsigned int kernel_size=atoi(argv[2]); int gpuNr=atoi(argv[2]); checkCudaErrors(cudaSetDevice(gpuNr)); IplImage* gray_image = cvLoadImage(argv[1],CV_LOAD_IMAGE_GRAYSCALE); unsigned char * d_input_image; unsigned char * d_output_image; int widthImage=gray_image->width; int heightImage=gray_image->height; IplImage *output_image = cvCreateImage(cvSize(widthImage,heightImage), IPL_DEPTH_8U, 1); for( int i=0;i<heightImage;i++) for( int j=0;j<widthImage;j++) output_image->imageData[i*widthImage+j]=255; unsigned int * d_histogram; int total_threads=256; cudaMalloc(&d_histogram,sizeof(unsigned int)*256*total_threads); checkCudaErrors(cudaMalloc(&d_input_image,widthImage*heightImage*sizeof(unsigned char))); checkCudaErrors(cudaMalloc(&d_output_image,widthImage*heightImage*sizeof(unsigned char))); checkCudaErrors(cudaMemcpy(d_input_image,gray_image->imageData,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyHostToDevice)); unsigned int windows_array[4]={15,17,25,31}; int total_implementations=4; double elapsed_time; for (int i=1;i<=total_implementations;i++) { for( int j=0;j<4;j++) { timer my_timer; MedianFilterUcharCUDA(d_input_image,d_output_image,d_histogram,widthImage,heightImage,windows_array[j],16,16,i); cudaThreadSynchronize(); elapsed_time=my_timer.elapsed(); printf("elapsed_time for implementation %d for window size %d was %f \n",i,windows_array[j],elapsed_time); } } timer array_timer; arrayFireRows(d_input_image,d_output_image,widthImage,heightImage,3,16,16); cudaThreadSynchronize(); elapsed_time=array_timer.elapsed(); printf("elapsed_time for array fire was %f \n",elapsed_time); checkCudaErrors(cudaMemcpy(output_image->imageData,d_output_image,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyDeviceToHost)); // _medianfilter((unsigned char *)gray_image->imageData, (unsigned char 
*)output_image->imageData, widthImage, heightImage); cvSaveImage("output.jpg",output_image); }
/**
 * Prints a trace line and the value 3*6*K(), allocates the device buffers
 * d_cost, d_mu_ and d_N_, and then calls loadRGBvaluesForMFaxes().
 */
void mmf::OptSO3MMFvMF::init()
{
  std::cout << "mmf::OptSO3MMFvMF::init()" << std::endl;
  std::cout << 3*6*K() << std::endl;

  // K()*6 floats
  const size_t costBytes = K()*6*sizeof(float);
  checkCudaErrors(cudaMalloc((void **)&d_cost, costBytes));

  // K()*6*3 floats
  const size_t muBytes = K()*6*3*sizeof(float);
  checkCudaErrors(cudaMalloc((void **)&d_mu_, muBytes));

  // a single int
  checkCudaErrors(cudaMalloc((void **)&d_N_, sizeof(int)));

  loadRGBvaluesForMFaxes();
}
/* * This function load the ptx file ptxPath and extract the kernel kName * to phKernel * @param phKernel Output kernel handle * @param ptxPath ptx file name * @param kName kernel name */ void ptxJIT(CUmodule *phModule, CUfunction *phKernel, const char *ptxPath, const char *kName) { CUlinkState cuLinkState; CUjit_option options[6]; void *optionVals[6]; float walltime; char error_log[8192], info_log[8192]; unsigned int logSize = 8192; void *cuOut; size_t outSize; int myErr = 0; // Setup linker options // Return walltime from JIT compilation options[0] = CU_JIT_WALL_TIME; optionVals[0] = (void *) &walltime; // Pass a buffer for info messages options[1] = CU_JIT_INFO_LOG_BUFFER; optionVals[1] = (void *) info_log; // Pass the size of the info buffer options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; optionVals[2] = (void *) (long)logSize; // Pass a buffer for error message options[3] = CU_JIT_ERROR_LOG_BUFFER; optionVals[3] = (void *) error_log; // Pass the size of the error buffer options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; optionVals[4] = (void *) (long) logSize; // Make the linker verbose options[5] = CU_JIT_LOG_VERBOSE; optionVals[5] = (void *) 1; // Create a pending linker invocation checkCudaErrors(cuLinkCreate(6,options, optionVals, &cuLinkState)); // Load the ptx from the file myErr = cuLinkAddFile(cuLinkState, CU_JIT_INPUT_PTX, ptxPath, 0, 0, 0); if (myErr != CUDA_SUCCESS){ // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); } // Complete the linker step checkCudaErrors(cuLinkComplete(cuLinkState, &cuOut, &outSize)); // Linker walltime and info_log were requested in options above. printf("CUDA Link Completed in %fms. 
Linker Output:\n%s\n", walltime, info_log); // Load resulting cuBin into module checkCudaErrors(cuModuleLoadData(phModule, cuOut)); // Locate the kernel entry point checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, kName)); // Destroy the linker invocation checkCudaErrors(cuLinkDestroy(cuLinkState)); }
void displayFunc(void) { sdkStartTimer(&timer); TColor *d_dst = NULL; size_t num_bytes; if (frameCounter++ == 0) { sdkResetTimer(&timer); } // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); getLastCudaError("cudaGraphicsMapResources failed"); checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); checkCudaErrors(CUDA_Bind2TextureArray()); runImageFilters(d_dst); checkCudaErrors(CUDA_UnbindTexture()); // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0)); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (frameCounter == frameN) { frameCounter = 0; if (g_FPS) { printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001)); g_FPS = false; } } glutSwapBuffers(); sdkStopTimer(&timer); computeFPS(); }
/**
 * Maps the GL buffer `buffer` into CUDA address space, copies iSize bytes
 * from devCalcArray into it (device to device), unmaps it again and marks
 * the object as flushed.
 */
void Mandelbrot::WriteBuffer()
{
	checkCudaErrors( cudaGLMapBufferObject( ( void** ) &this->devArray, this->buffer ), __LINE__, false );

	// The copy result was previously discarded; check it the same way as the
	// map/unmap calls around it so a failed copy is not silently shown.
	checkCudaErrors( cudaMemcpy( this->devArray, this->devCalcArray, this->iSize, cudaMemcpyDeviceToDevice ), __LINE__, false );

	checkCudaErrors( cudaGLUnmapBufferObject( this->buffer ), __LINE__, false );

	this->bIsFlushed = true;
}
// Destructor: releases the host buffer with cudaFreeHost and the device
// buffer with cudaFree; either is skipped when its pointer is null.
BaseData<Dtype>::~BaseData()
{
  if (cpu_data_)
  {
    checkCudaErrors(cudaFreeHost(cpu_data_));
  }

  if (gpu_data_)
  {
    checkCudaErrors(cudaFree(gpu_data_));
  }
}
void Renderer::render(const Camera& camera, float time) { // calc cam vars glm::vec3 A,B,C; { // camera ray C = glm::normalize(camera.getLookAt()-camera.getPosition()); // calc A (screen x) // calc B (screen y) then scale down relative to aspect // fov is for screen x axis A = glm::normalize(glm::cross(C,camera.getUp())); B = 1.0f/camera.getAspect()*glm::normalize(glm::cross(A,C)); // scale by FOV float tanFOV = tan(glm::radians(camera.getFOV())); A *= tanFOV; B *= tanFOV; } // cuda call unsigned int* out_data; checkCudaErrors(cudaGLMapBufferObject((void**)&out_data, pbo)); if (mode == RAYTRACE) { raytrace1(out_data, image_width, image_height, time, camera.getPosition(), A, B, C, scene_d, sceneSize); } else if (mode == PATHTRACE) { ++filmIters; pathtrace(out_data, image_width, image_height, time, camera.getPosition(), A, B, C, camera.m_lensRadius, camera.m_focalDist, scene_d, sceneSize, rand_d, rays_d, col_d, idx_d, film_d, filmIters); } checkCudaErrors(cudaGLUnmapBufferObject(pbo)); // download texture from destination PBO glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo); glActiveTexture(GL_TEXTURE0 + RENDER_TEXTURE); glBindTexture(GL_TEXTURE_2D, result_texture); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, NULL); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glActiveTexture(GL_TEXTURE0 + UNUSED_TEXTURE); SDK_CHECK_ERROR_GL(); fullScreenQuad.display(); }
void Canvas::paintGL() { glClear(GL_COLOR_BUFFER_BIT); if(!ready) return; size_t size; checkCudaErrors(cudaGraphicsMapResources(1, &resource, 0)); checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void**)&img, &size, resource)); if(renderMode == RENDER_MODE_RAYCASTING) { /*glEnable(GL_BLEND); glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); glBegin(GL_QUADS); glColor4f(0.4745f, 0.9294f, 0.8901f, 1.f); glVertex2f(1.f, 1.f); glColor4f(0.4745f, 0.9294f, 0.8901f, 1.f); glVertex2f(-1.f, 1.f); glColor4f(0.9490f, 0.9647f, 0.9803f, 1.f); glVertex2f(-1.f, -1.f); glColor4f(0.9490f, 0.9647f, 0.9803f, 1.f); glVertex2f(1.f, -1.f); glEnd();*/ render_raycasting(img, deviceVolume, transferFunction, camera, volumeReader.GetElementBoundingSphereRadius()); } else { render_pathtracer(img, renderParams); if(renderParams.frameNo == 0) { char* data = new char[WIDTH * HEIGHT * 4]; cudaMemcpy(data, img, sizeof(glm::u8vec4) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost); stbi_write_tga("0.tga", WIDTH, HEIGHT, 4, data); delete []data; } } checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaGraphicsUnmapResources(1, &resource, 0)); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo); glDrawPixels(WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glDisable(GL_BLEND); renderParams.frameNo++; }
// display results using OpenGL void display() { sdkStartTimer(&timer); // execute filter, writing results to pbo unsigned int *dResult; //DEPRECATED: checkCudaErrors( cudaGLMapBufferObject((void**)&d_result, pbo) ); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dResult, &num_bytes, cuda_pbo_resource)); bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(pbo)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display code path { glClear(GL_COLOR_BUFFER_BIT); // load texture from pbo glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, texid); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // fragment program is required to display floating point texture glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); glEnable(GL_FRAGMENT_PROGRAM_ARB); glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); { glTexCoord2f(0, 0); glVertex2f(0, 0); glTexCoord2f(1, 0); glVertex2f(1, 0); glTexCoord2f(1, 1); glVertex2f(1, 1); glTexCoord2f(0, 1); glVertex2f(0, 1); } glEnd(); glBindTexture(GL_TEXTURE_TYPE, 0); glDisable(GL_FRAGMENT_PROGRAM_ARB); } glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); }
// YV12/IYUV are both 4:2:0 planar formats (12bpc) // Luma, U, V chroma planar (12bpc), chroma is subsampled (w/2,h/2) void VideoEncoder::CopyYV12orIYUVFrame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock) { // Source is YV12/IYUV, this native format is converted to NV12 format by the video encoder // (1) luma copy setup CUDA_MEMCPY2D stCopyLuma; memset((void *)&stCopyLuma, 0, sizeof(stCopyLuma)); stCopyLuma.srcXInBytes = 0; stCopyLuma.srcY = 0; stCopyLuma.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyLuma.srcHost = sFrameParams.picBuf; stCopyLuma.srcDevice = 0; stCopyLuma.srcArray = 0; stCopyLuma.srcPitch = sFrameParams.Width; stCopyLuma.dstXInBytes = 0; stCopyLuma.dstY = 0; stCopyLuma.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyLuma.dstHost = 0; stCopyLuma.dstDevice = dptr_VideoFrame; stCopyLuma.dstArray = 0; stCopyLuma.dstPitch = m_pEncoderParams->nDeviceMemPitch; stCopyLuma.WidthInBytes = m_pEncoderParams->iInputSize[0]; stCopyLuma.Height = m_pEncoderParams->iInputSize[1]; // (2) chroma copy setup, U/V can be done together CUDA_MEMCPY2D stCopyChroma; memset((void *)&stCopyChroma, 0, sizeof(stCopyChroma)); stCopyChroma.srcXInBytes = 0; stCopyChroma.srcY = m_pEncoderParams->iInputSize[1]<<1; // U/V chroma offset stCopyChroma.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyChroma.srcHost = sFrameParams.picBuf; stCopyChroma.srcDevice = 0; stCopyChroma.srcArray = 0; stCopyChroma.srcPitch = sFrameParams.Width>>1; // chroma is subsampled by 2 (but it has U/V are next to each other) stCopyChroma.dstXInBytes = 0; stCopyChroma.dstY = m_pEncoderParams->iInputSize[1]<<1; // chroma offset (srcY*srcPitch now points to the chroma planes) stCopyChroma.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyChroma.dstHost = 0; stCopyChroma.dstDevice = dptr_VideoFrame; stCopyChroma.dstArray = 0; stCopyChroma.dstPitch = m_pEncoderParams->nDeviceMemPitch>>1; stCopyChroma.WidthInBytes = m_pEncoderParams->iInputSize[0]>>1; stCopyChroma.Height = 
m_pEncoderParams->iInputSize[1]; // U/V are sent together // Don't forget we need to lock/unlock between memcopies checkCudaErrors(cuvidCtxLock(ctxLock, 0)); checkCudaErrors(cuMemcpy2D(&stCopyLuma)); // Now DMA Luma checkCudaErrors(cuMemcpy2D(&stCopyChroma)); // Now DMA Chroma channels (UV side by side) checkCudaErrors(cuvidCtxUnlock(ctxLock, 0)); }
void initCuda(bool useRGBA) { // allocate device memory checkCudaErrors(cudaMalloc((void **) &d_img, (width * height * sizeof(unsigned int)))); checkCudaErrors(cudaMalloc((void **) &d_temp, (width * height * sizeof(unsigned int)))); // Refer to boxFilter_kernel.cu for implementation initTexture(width, height, h_img, useRGBA); sdkCreateTimer(&timer); sdkCreateTimer(&kernel_timer); }
/**
 * Selects device 0, caches its properties in mDeviceProperty, creates the
 * cuBLAS handle mCublasHandle and prints the device name and compute
 * capability.
 */
void CudaInterface::initialize()
{
    mDevID = 0;
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    checkCudaErrors(cudaGetDeviceProperties(&mDeviceProperty, mDevID));
    checkCudaErrors(cublasCreate(&mCublasHandle));

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
           mDevID, mDeviceProperty.name, mDeviceProperty.major, mDeviceProperty.minor);

    // The local `block_size` that used to be computed here was never read,
    // so it has been removed.
}
void RunCuda(struct cudaGraphicsResource **resource) { // map OpenGL buffer object for writing from CUDA checkCudaErrors(cudaGraphicsMapResources(1, resource, 0), exit(0)); float4 *devPtr; size_t size; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&devPtr, &size, *resource), exit(0)); //printf("CUDA mapped VBO: May access %ld bytes\n", size); launch_kernel(devPtr, MeshWidth, MeshHeight, _anim); // unmap buffer object checkCudaErrors(cudaGraphicsUnmapResources(1, resource, 0), exit(0)); }
// Uploads `data` (4 values of type T per body, m_numBodies bodies) as the
// position or velocity array and resets the read/write indices to 0/1.
//
// array: BODYSYSTEM_VELOCITY selects the velocity array; any other value is
//        handled as BODYSYSTEM_POSITION.
// data:  host source buffer.
//
// Positions go to the GL buffer when m_bUsePBO is set, otherwise to host
// memory (m_bUseSysMem) or device memory. Velocities go to host or device
// memory only.
void BodySystemGPU<T>::setArray(BodyArray array, const T *data)
{
    assert(m_bInitialized);

    m_currentRead = 0;
    m_currentWrite = 1;

    switch (array)
    {
        default:
        case BODYSYSTEM_POSITION:
            {
                if (m_bUsePBO)
                {
                    glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]);
                    glBufferSubData(GL_ARRAY_BUFFER, 0, 4 * sizeof(T) * m_numBodies, data);

                    // Read the buffer size back as a sanity check on the upload.
                    int size = 0;
                    glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);

                    if ((unsigned)size != 4 * (sizeof(T) * m_numBodies))
                    {
                        // Fixed: the message ended in a literal 'n' instead of a newline.
                        fprintf(stderr, "WARNING: Pixel Buffer Object download failed!\n");
                    }

                    glBindBuffer(GL_ARRAY_BUFFER, 0);
                }
                else
                {
                    if (m_bUseSysMem)
                    {
                        memcpy(m_hPos[m_currentRead], data, m_numBodies * 4 * sizeof(T));
                    }
                    else
                        checkCudaErrors(cudaMemcpy(m_deviceData[0].dPos[m_currentRead], data,
                                                   m_numBodies * 4 * sizeof(T),
                                                   cudaMemcpyHostToDevice));
                }
            }
            break;

        case BODYSYSTEM_VELOCITY:
            if (m_bUseSysMem)
            {
                memcpy(m_hVel, data, m_numBodies * 4 * sizeof(T));
            }
            else
                checkCudaErrors(cudaMemcpy(m_deviceData[0].dVel, data,
                                           m_numBodies * 4 * sizeof(T),
                                           cudaMemcpyHostToDevice));

            break;
    }
}
void initCudaBuffers() { unsigned int size = width * height * sizeof(unsigned int); // allocate device memory checkCudaErrors(cudaMalloc((void **) &d_img, size)); checkCudaErrors(cudaMalloc((void **) &d_temp, size)); checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); sdkCreateTimer(&timer); }
/**
 * Fills the parameter block with a byte value.
 *
 * @param pmem    block to fill; pmem.totalSize * sizeof(float) bytes starting
 *                at pmem.base are written. pmem.device selects cudaMemset
 *                (true) or memset (false).
 * @param byteVal value passed to cudaMemset/memset (a byte-wise fill).
 */
void CudaInterface::fillParamMem(ParamMem_t& pmem, int byteVal)
{
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));

    std::cout << " setting " << pmem.totalSize * sizeof(float) << " bytes to " << pmem.base << "\n";

    if (pmem.device)
    {
        // cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is its
        // documented replacement.
        checkCudaErrors(cudaDeviceSynchronize());
        checkCudaErrors(cudaMemset(pmem.base, byteVal, pmem.totalSize * sizeof(float)));
        checkCudaErrors(cudaDeviceSynchronize());
    }
    else
        memset(pmem.base, byteVal, pmem.totalSize * sizeof(float));
}
static void addImageToTextureUint (vector<Mat_<uint8_t> > &imgs, cudaTextureObject_t texs[]) { for (unsigned int i=0; i<imgs.size(); i++) { int rows = imgs[i].rows; int cols = imgs[i].cols; // Create channel with uint8_t point type cudaChannelFormatDesc channelDesc = //cudaCreateChannelDesc (8, //0, //0, //0, //cudaChannelFormatKindUnsigned); cudaCreateChannelDesc<char>(); // Allocate array with correct size and number of channels cudaArray *cuArray; checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, cols, rows)); checkCudaErrors (cudaMemcpy2DToArray (cuArray, 0, 0, imgs[i].ptr<uint8_t>(), imgs[i].step[0], cols*sizeof(uint8_t), rows, cudaMemcpyHostToDevice)); // Specify texture struct cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeArray; resDesc.res.array.array = cuArray; // Specify texture object parameters struct cudaTextureDesc texDesc; memset(&texDesc, 0, sizeof(texDesc)); texDesc.addressMode[0] = cudaAddressModeWrap; texDesc.addressMode[1] = cudaAddressModeWrap; texDesc.filterMode = cudaFilterModePoint; texDesc.readMode = cudaReadModeElementType; texDesc.normalizedCoords = 0; // Create texture object //cudaTextureObject_t &texObj = texs[i]; checkCudaErrors(cudaCreateTextureObject(&(texs[i]), &resDesc, &texDesc, NULL)); //texs[i] = texObj; } return; }
// Returns the host copy of the requested array. Unless the system runs in
// host memory (m_bUseSysMem), the data is first copied from the device
// (m_numBodies*4 values of type T), mapping the GL resource around the copy
// when positions live in a PBO.
//
// array: BODYSYSTEM_VELOCITY selects velocities; any other value is handled
//        as BODYSYSTEM_POSITION.
T* BodySystemGPU<T>::getArray(BodyArray array)
{
    assert(m_bInitialized);

    T *hostBuf = 0;
    T *devBuf = 0;
    cudaGraphicsResource *glResource = nullptr;

    // Host position buffers are indexed by m_currentRead only in sysmem mode.
    int hostReadIdx = m_bUseSysMem ? m_currentRead : 0;

    if (array == BODYSYSTEM_VELOCITY)
    {
        hostBuf = m_hVel;
        devBuf  = m_deviceData[0].dVel;
    }
    else
    {
        hostBuf = m_hPos[hostReadIdx];
        devBuf  = m_deviceData[0].dPos[m_currentRead];

        if (m_bUsePBO)
        {
            glResource = m_pGRes[m_currentRead];
        }
    }

    // In sysmem mode the host buffer is returned without any device copy.
    if (m_bUseSysMem)
    {
        return hostBuf;
    }

    if (glResource)
    {
        // Map read-only; the mapped pointer replaces devBuf as the copy source.
        checkCudaErrors(cudaGraphicsResourceSetMapFlags(glResource, cudaGraphicsMapFlagsReadOnly));
        checkCudaErrors(cudaGraphicsMapResources(1, &glResource, 0));
        size_t mappedBytes;
        checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&devBuf, &mappedBytes, glResource));
    }

    checkCudaErrors(cudaMemcpy(hostBuf, devBuf,
                               m_numBodies*4*sizeof(T), cudaMemcpyDeviceToHost));

    if (glResource)
    {
        checkCudaErrors(cudaGraphicsUnmapResources(1, &glResource, 0));
    }

    return hostBuf;
}
// Constructs the context: makes device_id's context current through the
// CudaMgr passed as cuda_mgr, loads `image` as a module with the given JIT
// options and looks up the function named kernel_name in it.
//
// image:        module image passed to cuModuleLoadDataEx
// kernel_name:  name of the function to resolve
// device_id:    stored in device_id_ and passed to CudaMgr::setContext
// cuda_mgr:     opaque pointer that is cast to const CudaMgr_Namespace::CudaMgr*
// num_options / options / option_vals: forwarded to cuModuleLoadDataEx
GpuCompilationContext::GpuCompilationContext(const void* image,
                                             const std::string& kernel_name,
                                             const int device_id,
                                             const void* cuda_mgr,
                                             unsigned int num_options,
                                             CUjit_option* options,
                                             void** option_vals)
    : module_(nullptr)
    , kernel_(nullptr)
    , device_id_(device_id)
    , cuda_mgr_(cuda_mgr) {
  const auto* mgr = static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_);
  mgr->setContext(device_id_);

  checkCudaErrors(
      cuModuleLoadDataEx(&module_, image, num_options, options, option_vals));
  CHECK(module_);

  checkCudaErrors(
      cuModuleGetFunction(&kernel_, module_, kernel_name.c_str()));
}
// Re-creates the image resources for the current width/height: releases the
// previous image, host buffer and device buffer, then allocates new ones of
// width * height * 4 bytes and copies the new device pointer value to
// d_fragColor.
void setupSizeResource()
{
    // Tear down the previous resources.
    deleteImage(img);
    free(img_content);
    checkCudaErrors(cuMemFree(d_img_content));

    // Four bytes per pixel.
    item_size = width * height * 4;

    // Allocate the replacements.
    img = createImage(width, height);
    img_content = (unsigned char*)malloc(item_size);
    checkCudaErrors(cuMemAlloc(&d_img_content, item_size));

    // The source is the address of the handle itself (&d_img_content), so it is
    // the pointer value, not the buffer contents, that is copied.
    checkCudaErrors(cuMemcpyHtoD(d_fragColor, &d_img_content, d_fragColor_bytes));
}
// Destroys the three CUDA events (skipped when useCpu is set) and then the
// float demo instance, plus the double one when bSupportDouble is set.
void finalize()
{
    if (!useCpu)
    {
        checkCudaErrors(cudaEventDestroy(startEvent));
        checkCudaErrors(cudaEventDestroy(stopEvent));
        checkCudaErrors(cudaEventDestroy(hostMemSyncEvent));
    }

    NBodyDemo<float>::Destroy();

    if (bSupportDouble)
    {
        NBodyDemo<double>::Destroy();
    }
}
// Solves the system in place: uploads b (gpu_matrix.order doubles) to a
// temporary device vector, runs the lower and then the upper solve on it,
// downloads the result back into b and frees the temporary.
//
// gpu_matrix: matrix descriptor; only `order` is read here, the rest is
//             forwarded to the two solve routines.
// b:          host vector, overwritten with the result.
// handle:     cuBLAS handle forwarded to the solve routines.
void solve_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * b, cublasHandle_t handle)
{
    double * deviceVec;

    checkCudaErrors(cudaMalloc(&deviceVec, gpu_matrix.order*sizeof(double)));

    // host -> device
    checkCublasErrors(cublasSetVector(gpu_matrix.order, sizeof(double),
                                      b, 1, deviceVec, 1));

    solve_lower_system_on_gpu(gpu_matrix, deviceVec, handle);
    solve_upper_system_on_gpu(gpu_matrix, deviceVec, handle);

    // device -> host
    checkCublasErrors(cublasGetVector(gpu_matrix.order, sizeof(double),
                                      deviceVec, 1, b, 1));

    checkCudaErrors(cudaFree(deviceVec));
}
// This test specifies a single test (where you specify radius and/or iterations) int runSingleTest(char *ref_file, char *exec_path) { int nTotalErrors = 0; char dump_file[256]; printf("[runSingleTest]: [%s]\n", sSDKsample); initCuda(); unsigned int *dResult; unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int)); size_t pitch; checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height)); // run the sample radius { printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations); bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); // check if kernel execution generated an error getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); // readback the results to system memory cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch, sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost); sprintf(dump_file, "nature_%02d.ppm", filter_radius); sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false)) { printf("Image is Different "); nTotalErrors++; } else { printf("Image is Matching "); } printf(" <%s>\n", ref_file); } printf("\n"); free(hResult); checkCudaErrors(cudaFree(dResult)); return nTotalErrors; }
/**
 * Releases the parameter block and clears its pointers.
 *
 * pmem.base is released with cudaFree when pmem.device is set and with free
 * otherwise; afterwards base, softmax, transformW, transformV and wordVecs
 * are all set to NULL.
 */
void CudaInterface::freeParamMem(ParamMem_t& pmem)
{
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));

    if (!pmem.device)
    {
        free(pmem.base);
    }
    else
    {
        checkCudaErrors(cudaFree(pmem.base));
    }

    // Only base is freed above; the other four pointers are just cleared.
    pmem.base       = NULL;
    pmem.softmax    = NULL;
    pmem.transformW = NULL;
    pmem.transformV = NULL;
    pmem.wordVecs   = NULL;
}
// Passes softening*softening to setSofteningSquared once per device,
// selecting each device first when more than one is in use.
void BodySystemGPU<T>::setSoftening(T softening)
{
    T squared = softening*softening;

    for (unsigned int dev = 0; dev < m_numDevices; dev++)
    {
        // With a single device the current device is left untouched.
        if (m_numDevices > 1)
        {
            checkCudaErrors(cudaSetDevice(dev));
        }

        checkCudaErrors(setSofteningSquared(squared));
    }
}
// This is the normal display path void display(void) { sdkStartTimer(&timer); // Sobel operation Pixel *data = NULL; // map PBO to get CUDA device pointer checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes, cuda_pbo_resource)); //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); glClear(GL_COLOR_BUFFER_BIT); glBindTexture(GL_TEXTURE_2D, texid); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0)); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glDisable(GL_DEPTH_TEST); glEnable(GL_TEXTURE_2D); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); glBegin(GL_QUADS); glVertex2f(0, 0); glTexCoord2f(0, 0); glVertex2f(0, 1); glTexCoord2f(1, 0); glVertex2f(1, 1); glTexCoord2f(1, 1); glVertex2f(1, 0); glTexCoord2f(0, 1); glEnd(); glBindTexture(GL_TEXTURE_2D, 0); glutSwapBuffers(); sdkStopTimer(&timer); computeFPS(); }
void initGLBuffers() { if (pbo) { // delete old buffer checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); glDeleteBuffersARB(1, &pbo); } // create pixel buffer object for display glGenBuffersARB(1, &pbo); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(uchar4), 0, GL_STREAM_DRAW_ARB); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); #if USE_BUFFER_TEX // create buffer texture, attach to pbo if (bufferTex) { glDeleteTextures(1, &bufferTex); } glGenTextures(1, &bufferTex); glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); glTexBufferEXT(GL_TEXTURE_BUFFER_EXT, GL_RGBA8, pbo); glBindTexture(GL_TEXTURE_BUFFER_EXT, 0); #else // create texture for display if (displayTex) { glDeleteTextures(1, &displayTex); } glGenTextures(1, &displayTex); glBindTexture(GL_TEXTURE_TYPE, displayTex); glTexImage2D(GL_TEXTURE_TYPE, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glBindTexture(GL_TEXTURE_TYPE, 0); #endif // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); }
int main(int argc, char **argv) { uchar4 *h_inputImageRGBA, *d_inputImageRGBA; uchar4 *h_outputImageRGBA, *d_outputImageRGBA; unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; float *h_filter; int filterWidth; std::string input_file; std::string output_file; if (argc == 3) { input_file = std::string(argv[1]); output_file = std::string(argv[2]); } else { std::cerr << "Usage: ./hw input_file output_file" << std::endl; exit(1); } //load the image and give us our input and output pointers preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA, &d_redBlurred, &d_greenBlurred, &d_blueBlurred, &h_filter, &filterWidth, input_file); allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth); GpuTimer timer; timer.Start(); //call the students' code your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(), d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); timer.Stop(); cudaDeviceSynchronize(); //checkCudaErrors(cudaGetLastError()); int err = printf("%f msecs.\n", timer.Elapsed()); if (err < 0) { //Couldn't print! Probably the student closed stdout - bad news std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; exit(1); } cleanup(); //check results and output the blurred image postProcess(output_file); checkCudaErrors(cudaFree(d_redBlurred)); checkCudaErrors(cudaFree(d_greenBlurred)); checkCudaErrors(cudaFree(d_blueBlurred)); return 0; }
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) { m_nbody->update(activeParams.m_timestep); } if (useCpu) { sdkCreateTimer(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { sdkStopTimer(&timer); milliseconds = sdkGetTimerValue(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(stopEvent, 0)); checkCudaErrors(cudaEventSynchronize(stopEvent)); checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); printf("%d bodies, total time for %d iterations: %.3f ms, mean %f\n", numBodies, iterations, milliseconds, milliseconds/iterations); printf("= %.3f billion interactions per second\n", interactionsPerSecond); printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }