inline void AllocSpace(Tensor<gpu, dim> &obj, bool pad) {
    size_t pitch;
    // common choice for CUDA memory alignment unit is 32
    if (pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32) {
        cudaError_t err = cudaMallocPitch((void**)&obj.dptr, &pitch,
                                          obj.shape[0] * sizeof(real_t),
                                          obj.FlatTo2D().shape[1]);
        utils::Assert(err == cudaSuccess, cudaGetErrorString(err));
        obj.shape.stride_ = static_cast<index_t>(pitch / sizeof(real_t));
    } else {
        obj.shape.stride_ = obj.shape[0];
        cudaError_t err = cudaMallocPitch((void**)&obj.dptr, &pitch,
                                          obj.shape.Size() * sizeof(real_t), 1);
        utils::Assert(err == cudaSuccess, cudaGetErrorString(err));
    }
}
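// Hedged sketch (not part of mshadow) of how a pitch returned by cudaMallocPitch
// is consumed: rows must be stepped through the byte pitch, because the driver
// may pad each row for alignment. All names below are illustrative.
__global__ void scaleRows(float* data, size_t pitchBytes, int width, int height, float s) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // advance to row y in bytes, then index elements within the row
        float* row = (float*)((char*)data + y * pitchBytes);
        row[x] *= s;
    }
}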
void CudaUtil::cudaCheckMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height,
                                    int line, const char* file) {
    cudaError_t error = cudaMallocPitch(ptr, pitch, width, height);
    if (error != cudaSuccess) {
        std::ostringstream os;
        os << "cudaMallocPitch returned error code " << error
           << ", line " << line << ", in file " << file;
        throw CudaException(os.str());
    }
}
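// Hedged usage sketch: a wrapper like the one above is usually invoked through
// a macro so that __LINE__ and __FILE__ refer to the call site. The macro name
// is illustrative, not part of the original CudaUtil interface.
#define CUDA_CHECK_MALLOC_PITCH(ptr, pitch, width, height) \
    CudaUtil::cudaCheckMallocPitch((ptr), (pitch), (width), (height), __LINE__, __FILE__)

// e.g.:
//   float* d_img; size_t pitch;
//   CUDA_CHECK_MALLOC_PITCH((void**)&d_img, &pitch, cols * sizeof(float), rows);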
void ScanPlan::allocate(size_t elemSizeBytes, size_t numElements, size_t numRows, size_t rowPitch) {
    const size_t blockSize = SCAN_ELTS_PER_THREAD * SCAN_CTA_SIZE;
    m_numElements = numElements;
    m_numRows = numRows;
    m_elemSizeBytes = elemSizeBytes;

    // find required number of levels
    size_t level = 0;
    size_t numElts = m_numElements;
    do {
        size_t numBlocks = (numElts + blockSize - 1) / blockSize;
        if (numBlocks > 1) {
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    m_numLevels = level;
    m_blockSums = (void**) malloc(m_numLevels * sizeof(void*));

    if (m_numRows > 1) {
        m_rowPitches = (size_t*) malloc((m_numLevels + 1) * sizeof(size_t));
        m_rowPitches[0] = rowPitch;
    }

    // allocate storage for block sums
    numElts = m_numElements;
    level = 0;
    do {
        size_t numBlocks = (numElts + blockSize - 1) / blockSize;
        if (numBlocks > 1) {
            // Use cudaMallocPitch for multi-row block sums to ensure alignment
            if (m_numRows > 1) {
                size_t dpitch;
                cudaSafeCall(cudaMallocPitch((void**)&(m_blockSums[level]), &dpitch,
                                             numBlocks * m_elemSizeBytes, numRows));
                m_rowPitches[level+1] = dpitch / m_elemSizeBytes;
            } else {
                cudaSafeCall(cudaMalloc((void**)&(m_blockSums[level]),
                                        numBlocks * m_elemSizeBytes));
            }
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    cudaCheckMsg("ScanPlan::allocate");
}
RenderTarget::RenderTarget(COM::size_t width, COM::size_t height)
    : _texture(width, height, GL_RGBA32F, GL_RGBA, GL_FLOAT) {
    // one float4 per pixel to mirror the GL_RGBA32F texture
    CUDA_CALL(cudaMallocPitch((void**)&_deviceMem, &_pitch,
                              width * sizeof(float) * 4, height));
    CUDA_CALL(cudaGraphicsGLRegisterImage(&_resource, _texture.GetID(), GL_TEXTURE_2D,
                                          cudaGraphicsMapFlagsNone));
}
lcudaMatrix lcudaAllocMatrix(int width, int height) {
    lcudaMatrix matrix;
    cudaMallocPitch((void **)&matrix.data, (size_t*)&matrix.pitch,
                    width * sizeof(lcudaFloat), height);
    matrix.width = width;
    matrix.height = height;
    return matrix;
}
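// Hedged companion sketch for the allocator above: freeing the matrix and
// addressing element (x, y) through the byte pitch. lcudaFreeMatrix and
// lcudaGetElement are illustrative names; only cudaFree and the pitch
// arithmetic are CUDA facts.
void lcudaFreeMatrix(lcudaMatrix matrix) {
    cudaFree(matrix.data);
}

__device__ lcudaFloat lcudaGetElement(lcudaMatrix m, int x, int y) {
    lcudaFloat* row = (lcudaFloat*)((char*)m.data + y * m.pitch);
    return row[x];
}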
// This test specifies a single test (where you specify radius and/or iterations)
int runSingleTest(char *ref_file, char *exec_path) {
    int nTotalErrors = 0;
    char dump_file[256];

    printf("[runSingleTest]: [%s]\n", sSDKsample);

    initCuda();

    unsigned int *dResult;
    unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int));
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));

    // run the sample radius
    {
        printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations);
        bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

        // check if kernel execution generated an error
        getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
        checkCudaErrors(cudaDeviceSynchronize());

        // readback the results to system memory
        cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch,
                     sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost);

        sprintf(dump_file, "nature_%02d.ppm", filter_radius);
        sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height);

        if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false)) {
            printf("Image is Different ");
            nTotalErrors++;
        } else {
            printf("Image is Matching ");
        }

        printf(" <%s>\n", ref_file);
    }
    printf("\n");

    free(hResult);
    checkCudaErrors(cudaFree(dResult));

    return nTotalErrors;
}
HRESULT RegisterD3D9ResourceWithCUDA() {
    // 2D
    // register the Direct3D resources that we'll use
    // we'll read to and write from g_texture_2d, so don't set any special map flags for it
    cudaGraphicsD3D9RegisterResource(&g_texture_2d.cudaResource, g_texture_2d.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_2d) failed");
    // cuda cannot write into the texture directly: the texture is seen as a cudaArray and can only be mapped as a texture
    // Create a buffer so that cuda can write into it
    // pixel fmt is DXGI_FORMAT_R32G32B32A32_FLOAT
    cudaMallocPitch(&g_texture_2d.cudaLinearMemory, &g_texture_2d.pitch,
                    g_texture_2d.width * sizeof(float) * 4, g_texture_2d.height);
    getLastCudaError("cudaMallocPitch (g_texture_2d) failed");
    cudaMemset(g_texture_2d.cudaLinearMemory, 1, g_texture_2d.pitch * g_texture_2d.height);

    // CUBE
    cudaGraphicsD3D9RegisterResource(&g_texture_cube.cudaResource, g_texture_cube.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_cube) failed");
    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
    cudaMallocPitch(&g_texture_cube.cudaLinearMemory, &g_texture_cube.pitch,
                    g_texture_cube.size * 4, g_texture_cube.size);
    getLastCudaError("cudaMallocPitch (g_texture_cube) failed");
    cudaMemset(g_texture_cube.cudaLinearMemory, 1, g_texture_cube.pitch * g_texture_cube.size);
    getLastCudaError("cudaMemset (g_texture_cube) failed");

    // 3D
    cudaGraphicsD3D9RegisterResource(&g_texture_vol.cudaResource, g_texture_vol.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_vol) failed");
    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
    //cudaMallocPitch(&g_texture_vol.cudaLinearMemory, &g_texture_vol.pitch, g_texture_vol.width * 4, g_texture_vol.height * g_texture_vol.depth);
    cudaMalloc(&g_texture_vol.cudaLinearMemory, g_texture_vol.width * 4 * g_texture_vol.height * g_texture_vol.depth);
    g_texture_vol.pitch = g_texture_vol.width * 4;
    getLastCudaError("cudaMalloc (g_texture_vol) failed");
    cudaMemset(g_texture_vol.cudaLinearMemory, 1, g_texture_vol.pitch * g_texture_vol.height * g_texture_vol.depth);
    getLastCudaError("cudaMemset (g_texture_vol) failed");

    return S_OK;
}
GLFluids::GLFluids(QWidget *parent)
    : QGLWidget(parent),
      QGLFunctions() {
    vbo = 0;
    wWidth = qMax(512, DIM);
    wHeight = qMax(512, DIM);

    hvfield = (float2 *)malloc(sizeof(float2) * DS);
    memset(hvfield, 0, sizeof(float2) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(float2)*DIM, DIM);
    cudaMemcpy(dvfield, hvfield, sizeof(float2) * DS, cudaMemcpyHostToDevice);

    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(float2) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(float2) * PDS);

    setup_texture(DIM, DIM);
    bind_texture();

    // Create particle array
    particles = (float2 *)malloc(sizeof(float2) * DS);
    memset(particles, 0, sizeof(float2) * DS);
    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    QTimer *timer = new QTimer(this);
    connect(timer, &QTimer::timeout, [&]() {
        simulateFluids();
        updateGL();
    });
    timer->start(0);
}
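// Hedged aside on the cudaMemcpy above: it treats dvfield as a flat buffer,
// which only matches the pitched layout when tPitch happens to equal
// sizeof(float2)*DIM. A pitch-safe upload of the same data would be the
// strided copy sketched here (a sketch, reusing names from the code above):
static inline void uploadVelocityField(float2* dvfield, size_t tPitch,
                                       const float2* hvfield, int dim) {
    cudaMemcpy2D(dvfield, tPitch,                    // pitched destination rows
                 hvfield, sizeof(float2) * dim,      // tightly packed source rows
                 sizeof(float2) * dim, dim,          // row width in bytes, row count
                 cudaMemcpyHostToDevice);
}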
boost::shared_ptr<DeviceMatrix> makeDeviceMatrix(size_t height, size_t width) {
    DeviceMatrix* mat = new DeviceMatrix();
    mat->width = width;
    mat->height = height;
    CUDA_CALL(cudaMallocPitch((void**)&mat->data, &mat->pitch,
                              mat->width * sizeof(float), mat->height));

    // I can't imagine getting a pitch that's not a multiple of a float
    assert(mat->pitch % sizeof(float) == 0);
    // We want to express everything in floats
    mat->pitch /= sizeof(float);

    //printf("cudaMalloc: %p\n", mat->data);
    return boost::shared_ptr<DeviceMatrix>(mat, deleteDeviceMatrix);
}
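// Hedged sketch: once the pitch is stored in elements (as above), row access is
// plain float arithmetic rather than byte arithmetic. The helper name is
// illustrative, not part of the original DeviceMatrix API.
__device__ inline float deviceMatrixAt(const float* data, size_t pitchElems, int y, int x) {
    return data[y * pitchElems + x];
}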
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runBenchmark(int argc, char **argv) {
    printf("[runBenchmark]: [%s]\n", sSDKsample);

    loadImageData(argc, argv);
    initCuda();

    unsigned int *dResult;
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));
    sdkStartTimer(&kernel_timer);

    // warm-up
    bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    checkCudaErrors(cudaDeviceSynchronize());

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles);

    for (int i = 0; i < iCycles; i++) {
        dProcessingTime += bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    }

    // check if kernel execution generated an error and sync host
    getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&kernel_timer);

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    printf("bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u\n",
           (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1);
    printf("\n");

    return 0;
}
cudaError_t GridGpu::copyTrajBlocks() {
    // Copy gridding k-trajectory data
    int maxP = 0;
    for (int i = 0; i < m_trajBlocks.size(); i++) {
        if (m_trajBlocks[i].size() > maxP)
            maxP = m_trajBlocks[i].size();
    }
    m_d_trajBlocks.trajWidth = maxP;

    cudaMallocPitch(&m_d_trajBlocks.trajPoints, &m_d_trajBlocks.pitchTraj,
                    maxP * sizeof(TrajPointGpu), m_trajBlocks.size());
    cudaMemset(m_d_trajBlocks.trajPoints, 0, m_d_trajBlocks.pitchTraj * m_trajBlocks.size());
    qWarning() << "Max traj points per block:" << maxP;

    for (int i = 0; i < m_trajBlocks.size(); i++) {
        char *row = (char *)m_d_trajBlocks.trajPoints + i * m_d_trajBlocks.pitchTraj;
        cudaMemcpy(row, m_trajBlocks[i].data(),
                   m_trajBlocks[i].size() * sizeof(TrajPointGpu), cudaMemcpyHostToDevice);
    }

    return cudaGetLastError();
}
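// Hedged aside: the per-row cudaMemcpy loop above is needed because the host
// rows are ragged (each block has its own length). If the host side were first
// repacked into a uniform maxP-wide buffer, the whole upload could be a single
// strided copy; a sketch with hypothetical names:
//
//   cudaMemcpy2D(d_points, pitchTraj,                   // pitched device rows
//                h_packed, maxP * sizeof(TrajPointGpu), // packed host rows
//                maxP * sizeof(TrajPointGpu),           // width in bytes
//                numBlocks, cudaMemcpyHostToDevice);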
void vm::scanner::cuda::DeviceMemory2D::create(int rows_arg, int colsBytes_arg) {
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;

    if (rows_arg > 0 && colsBytes_arg > 0) {
        if (data_)
            release();

        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;

        cudaSafeCall( cudaMallocPitch((void**)&data_, &step_, colsBytes_, rows_) );

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
void pcl::gpu::DeviceMemory2D::create(int rows_arg, int colsBytes_arg) {
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;

    if (rows_arg > 0 && colsBytes_arg > 0) {
        if (data_)
            release();

        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;

        printf("[CUDA] Allocating memory %d x %d = %d bytes.\n",
               colsBytes_, rows_, colsBytes_ * rows_);

        cudaSafeCall( cudaMallocPitch((void**)&data_, &step_, colsBytes_, rows_) );

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
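// Hedged counterpart sketch for the two create() variants above: a matching
// release() that drops the refcount and frees the pitched buffer when it hits
// zero. This is modeled on the member names visible above (data_, step_,
// colsBytes_, rows_, refcount_), not copied from either project; namespace
// qualifiers are omitted.
void DeviceMemory2D::release() {
    if (refcount_ && --(*refcount_) == 0) {
        delete refcount_;
        cudaSafeCall( cudaFree(data_) );
    }
    data_ = 0;
    step_ = 0;
    colsBytes_ = 0;
    rows_ = 0;
    refcount_ = 0;
}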
DeviceMatrix3D::Ptr makeDeviceMatrix3D(size_t dim_t, size_t dim_y, size_t dim_x) {
    DeviceMatrix3D* mat = new DeviceMatrix3D();
    mat->dim_x = dim_x;
    mat->dim_y = dim_y;
    mat->dim_t = dim_t;
    size_t pitch;
    CUDA_CALL(cudaMallocPitch((void**)&mat->data, &pitch,
                              dim_x * sizeof(float), dim_y * dim_t));

    // I can't imagine getting a pitch that's not a multiple of a float
    assert(pitch % sizeof(float) == 0);
    // We want to express everything in floats
    pitch /= sizeof(float);

    mat->pitch_y = pitch;
    mat->pitch_t = dim_y*mat->pitch_y;

    return DeviceMatrix3D::Ptr(mat, deleteDeviceMatrix3D);
}
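// Hedged sketch of how the two element pitches above are consumed: element
// (t, y, x) lives at data[t * pitch_t + y * pitch_y + x], with both pitches in
// floats. The helper name is illustrative.
__device__ inline float deviceMatrix3DAt(const float* data,
                                         size_t pitch_t, size_t pitch_y,
                                         int t, int y, int x) {
    return data[t * pitch_t + y * pitch_y + x];
}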
// WARNING: ignorePitch = true should only be used for testing!
void CuWrapper::CuInit(int device, bool ignorePitch) {
    if (!CuInitialized) {
        Generic::Print("Initializing device");
        CuExe(cudaSetDevice(device), "cudaSetDevice failed, no device found");
        CuExe(cudaGetDeviceProperties(&CuProperties, device), "Getting device properties failed");

        // Make sure the default block size does not exceed the maximum of the device
        if (CuBlockDim.x * CuBlockDim.y * CuBlockDim.z > CuProperties.maxThreadsPerBlock) {
            int dim = (int)floor(sqrt((float)CuProperties.maxThreadsPerBlock));
            dim = dim > 16 ? dim - dim % 16 : dim;
            CuBlockDim.x = dim;
            CuBlockDim.y = dim;
            CuBlockDim.z = 1;
        }

        // Determine the device pitch in bytes by allocating an integer array of
        // dimension 1x1. Use this to pad matrices on the host.
        if (ignorePitch) {
            // WARNING: this will only make CuGetPitchSize work for CU_TYPE!
            // IgnorePitch should only be used for testing.
            CuPitchBytes = sizeof(CU_TYPE);
        } else {
            int *tmp;
            CuExe(cudaMallocPitch((void**)&tmp, &CuPitchBytes, sizeof(int), 1));
            CuExe(cudaFree(tmp));
        }

        CuInitialized = true;
    }
}
void run_2D_GLOBAL_MEMORY() {
    int arrayWidth = 4;
    int arrayHeight = 4;
    bool SEQ = true;

    /* Host allocation */
    float* inArr_1_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
    float* inArr_2_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
    float* outArr_H  = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));

    /* Fill arrays */
    int index = 0;
    if (SEQ) {
        int ctr = 0;
        for (int j = 0; j < arrayHeight; j++) {
            for (int i = 0; i < arrayWidth; i++) {
                index = (j * arrayWidth) + i;
                inArr_1_H[index] = (float) ctr++;
                inArr_2_H[index] = (float) ctr++;
                outArr_H[index] = 0.0f;
            }
        }
    } else {
        for (int j = 0; j < arrayHeight; j++) {
            for (int i = 0; i < arrayWidth; i++) {
                index = (j * arrayWidth) + i;
                inArr_1_H[index] = (float)rand()/(float)RAND_MAX;
                inArr_2_H[index] = (float)rand()/(float)RAND_MAX;
                outArr_H[index] = 0;
            }
        }
    }

    /* Print host arrays */
    printf("inArr_1_H \n");
    print_2D_Array(inArr_1_H, arrayWidth, arrayHeight);
    printf("inArr_2_H \n");
    print_2D_Array(inArr_2_H, arrayWidth, arrayHeight);

    /* Device allocation + <__pitch>.
       Note: the row width passed to cudaMallocPitch is arrayHeight floats and
       the row count is arrayWidth; the copies below use the same convention,
       and the two extents are equal in this test. */
    float *inArr_1_D, *inArr_2_D, *outArr_D;
    size_t __pitch;
    cudaMallocPitch((void**)&inArr_1_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
    cudaMallocPitch((void**)&inArr_2_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
    cudaMallocPitch((void**)&outArr_D,  &__pitch, arrayHeight * sizeof(float), arrayWidth);

    /* Print __pitch */
    printf("__pitch %zu \n", __pitch/sizeof(float));

    /* Uploading data */
    cudaMemcpy2D(inArr_1_D, __pitch, inArr_1_H, arrayHeight * sizeof(float),
                 arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
    cudaMemcpy2D(inArr_2_D, __pitch, inArr_2_H, arrayHeight * sizeof(float),
                 arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);

    /* Gridding */
    dim3 __numBlocks(1,1,1);
    dim3 __numThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
    __numBlocks.x = (arrayWidth / BLOCK_SIZE) + ((arrayWidth % BLOCK_SIZE) == 0 ? 0 : 1);
    __numBlocks.y = (arrayHeight / BLOCK_SIZE) + ((arrayHeight % BLOCK_SIZE) == 0 ? 0 : 1);

    /* Kernel invocation */
    add_2D_Array(inArr_1_D, inArr_2_D, outArr_D, arrayWidth, arrayHeight, __pitch,
                 __numBlocks, __numThreadsPerBlock);

    /* Synchronization */
    cudaDeviceSynchronize();

    /* Download result */
    cudaMemcpy2D(outArr_H, arrayHeight * sizeof(float), outArr_D, __pitch,
                 arrayHeight * sizeof(float), arrayWidth, cudaMemcpyDeviceToHost);

    /* Free device arrays */
    cudaFree(inArr_1_D);
    cudaFree(inArr_2_D);
    cudaFree(outArr_D);

    /* Display results */
    printf("outArr \n");
    print_2D_Array(outArr_H, arrayWidth, arrayHeight);
}
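/* Hedged sketch of the kind of kernel an add_2D_Array launch wrapper like the
   one above typically dispatches to: an element-wise add over pitched rows.
   The kernel name and signature are illustrative, not the original. */
__global__ void add_2D_Kernel(const float* a, const float* b, float* out,
                              int width, int height, size_t pitchBytes) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        /* each row starts pitchBytes after the previous one */
        const float* rowA = (const float*)((const char*)a + y * pitchBytes);
        const float* rowB = (const float*)((const char*)b + y * pitchBytes);
        float* rowO = (float*)((char*)out + y * pitchBytes);
        rowO[x] = rowA[x] + rowB[x];
    }
}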
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[]) {
    int Nx, Ny, Nz, max_iters;
    int blockX, blockY, blockZ;

    if (argc == 8) {
        Nx = atoi(argv[1]);
        Ny = atoi(argv[2]);
        Nz = atoi(argv[3]);
        max_iters = atoi(argv[4]);
        blockX = atoi(argv[5]);
        blockY = atoi(argv[6]);
        blockZ = atoi(argv[7]);
    } else {
        printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
        exit(1);
    }

    // Get the number of GPUs
    int number_of_devices;
    checkCuda(cudaGetDeviceCount(&number_of_devices));

    if (number_of_devices < 2) {
        printf("Less than two devices were found.\n");
        printf("Exiting...\n");
        return -1;
    }

    // Decompose along the Z-axis
    int _Nz = Nz/number_of_devices;

    // Define constants
    const _DOUBLE_ L = 1.0;
    const _DOUBLE_ h = L/(Nx+1);
    const _DOUBLE_ dt = h*h/6.0;
    const _DOUBLE_ beta = dt/(h*h);
    const _DOUBLE_ c0 = beta;
    const _DOUBLE_ c1 = (1-6*beta);

    // Check if ECC is turned on
    ECCCheck(number_of_devices);

    // Set the number of OpenMP threads
    omp_set_num_threads(number_of_devices);

    #pragma omp parallel
    {
        unsigned int nthreads = omp_get_num_threads();

        #pragma omp single
        {
            printf("Number of OpenMP threads: %u\n", nthreads);
        }
    }

    // CPU memory operations
    int dt_size = sizeof(_DOUBLE_);

    _DOUBLE_ *u_new, *u_old;

    u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate arrays on the host
    size_t pitch_bytes;
    size_t pitch_gc_bytes;

    _DOUBLE_ *h_Unew, *h_Uold;
    _DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
    _DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
    _DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

    h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    init(h_Uold, h_Unew, h, Nx, Ny, Nz);

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        // Pinned host allocations (portable across contexts)
        checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

        init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
    }

    // GPU memory operations
    _DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
    _DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
    _DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        checkCuda(cudaSetDevice(tid));

        CopyToConstantMemory(c0, c1);

        checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
        checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
        checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    }

    // Copy data from host to the device
    double HtD_timer = 0.;
    HtD_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        checkCuda(cudaSetDevice(tid));
        checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
        checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
    }

    HtD_timer += omp_get_wtime();

    int pitch = pitch_bytes/dt_size;
    int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
    dim3 threads_per_block(blockX, blockY, blockZ);
    unsigned int blocksInX = getBlock(Nx, blockX);
    unsigned int blocksInY = getBlock(Ny, blockY);
    unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
    dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
    dim3 thread_blocks_halo(blocksInX, blocksInY);

    double compute_timer = 0.;
    compute_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        for (int iterations = 0; iterations < max_iters; iterations++) {
            // Compute inner nodes
            checkCuda(cudaSetDevice(tid));
            ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

            // Copy right boundary data to host
            if (tid == 0) {
                checkCuda(cudaSetDevice(tid));
                CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
                checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
            }
            // Copy left boundary data to host
            if (tid == 1) {
                checkCuda(cudaSetDevice(tid));
                CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
                checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
            }

            #pragma omp barrier

            // Copy right boundary data to device 1
            if (tid == 1) {
                checkCuda(cudaSetDevice(tid));
                checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
                CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
            }
            // Copy left boundary data to device 0
            if (tid == 0) {
                checkCuda(cudaSetDevice(tid));
                checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
                CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
            }

            // Swap pointers on the host
            #pragma omp barrier
            checkCuda(cudaSetDevice(tid));
            checkCuda(cudaDeviceSynchronize());
            swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
        }
    }

    compute_timer += omp_get_wtime();

    // Copy data from device to host
    double DtH_timer = 0;
    DtH_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        checkCuda(cudaSetDevice(tid));

        checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
    }

    DtH_timer += omp_get_wtime();

    // Merge sub-domains into one big domain
    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
    }

    // Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
    cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

    float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
    PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

    _DOUBLE_ t = max_iters * dt;
    CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        checkCuda(cudaSetDevice(tid));

        checkCuda(cudaFree(d_s_Unews[tid]));
        checkCuda(cudaFree(d_s_Uolds[tid]));
        checkCuda(cudaFree(d_right_send_buffer[tid]));
        checkCuda(cudaFree(d_left_send_buffer[tid]));
        checkCuda(cudaFree(d_right_receive_buffer[tid]));
        checkCuda(cudaFree(d_left_receive_buffer[tid]));
        checkCuda(cudaFreeHost(h_s_Unews[tid]));
        checkCuda(cudaFreeHost(h_s_Uolds[tid]));
        checkCuda(cudaFreeHost(left_send_buffer[tid]));
        checkCuda(cudaFreeHost(right_send_buffer[tid]));
        checkCuda(cudaFreeHost(left_receive_buffer[tid]));
        checkCuda(cudaFreeHost(right_receive_buffer[tid]));

        checkCuda(cudaDeviceReset());
    }

    free(u_old);
    free(u_new);

    return 0;
}
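// Hedged sketch of the swap(...) helper used in the iteration loop above; its
// definition is not visible in this excerpt, but a conventional macro with the
// same call shape would be:
#define swap(type, a, b) do { type tmp_ = (a); (a) = (b); (b) = tmp_; } while (0)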
void run_back_projection_with_normal_estimate(
    std::vector<float4>& vertices,
    std::vector<float4>& normals,
    const std::vector<ushort>& depth_buffer,
    uint width,
    uint height,
    ushort max_depth) {
    StopWatchInterface *kernel_timer = nullptr;

    ushort* h_depth_buffer = (ushort*)depth_buffer.data();
    size_t in_pitch, out_pitch;

    // copy image data to array
    ushort* d_depth_buffer = nullptr;
    checkCudaErrors(cudaMallocPitch(&d_depth_buffer, &in_pitch, sizeof(ushort) * width, height));
    checkCudaErrors(cudaMemcpy2D(
        d_depth_buffer, in_pitch,
        h_depth_buffer, sizeof(ushort) * width,
        sizeof(ushort) * width, height,
        cudaMemcpyHostToDevice));

    float4* d_vertex_buffer;
    checkCudaErrors(cudaMallocPitch(&d_vertex_buffer, &out_pitch, width * sizeof(float4), height));

    float4* d_normal_buffer;
    checkCudaErrors(cudaMallocPitch(&d_normal_buffer, &out_pitch, width * sizeof(float4), height));

    sdkCreateTimer(&kernel_timer);
    sdkStartTimer(&kernel_timer);

    Eigen::Matrix4f h_inverse_projection =
        perspective_matrix_inverse<float>(fov_y, aspect_ratio, near_plane, far_plane);

    //bilateralFilter_normal_estimate_float4((OutputPixelType*)dOutputImage, (InputPixelType*)dInputImage, width, height, in_pitch, out_pitch, max_depth, euclidean_delta, filter_radius, iterations, kernel_timer);
    back_projection_with_normal_estimation(
        d_vertex_buffer, d_normal_buffer, d_depth_buffer,
        width, height, max_depth, in_pitch, out_pitch,
        h_inverse_projection.data());

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&kernel_timer);

    std::cout << "Kernel Timer : " << kernel_timer->getTime() << " msec" << std::endl;
    sdkDeleteTimer(&kernel_timer);

    vertices.resize(depth_buffer.size());
    normals.resize(depth_buffer.size());

    cudaMemcpy2D(
        vertices.data(), sizeof(float4) * width,
        d_vertex_buffer, out_pitch,
        sizeof(float4) * width, height,
        cudaMemcpyDeviceToHost);

    cudaMemcpy2D(
        normals.data(), sizeof(float4) * width,
        d_normal_buffer, out_pitch,
        sizeof(float4) * width, height,
        cudaMemcpyDeviceToHost);

    checkCudaErrors(cudaFree(d_depth_buffer));
    checkCudaErrors(cudaFree(d_vertex_buffer));
    checkCudaErrors(cudaFree(d_normal_buffer));
}
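// Hedged note on the function above: the two float4 allocations share one
// out_pitch variable, so the second cudaMallocPitch overwrites the pitch of
// the first. With identical widths and heights the driver can reasonably be
// expected to return the same pitch, but the API does not guarantee it;
// storing each pitch separately would make that assumption explicit.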
int main(int argc, char **argv) {
    int devID;
    cudaDeviceProp deviceProps;

    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv)) {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
#ifndef OPTIMUS
    devID = findCudaGLDevice(argc, (const char **)argv);
#else
    devID = gpuGetMaxGflopsDeviceId();
#endif

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array in host memory
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

#ifdef BROADCAST
    int step = 1;

    // Broadcasted visualization stepping.
    if (argc > 3)
        step = atoi(argv[3]);

    // Create additional space to store particle packets for broadcasting.
    wstep = step; hstep = step;
    int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize;
    if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize)
        npackets++;
    packets = (char*)malloc(npackets * (UdpBroadcastServer::PacketSize + sizeof(unsigned int)));
#endif

    initParticles(particles, DIM, DIM);

#if defined(OPTIMUS) || defined(BROADCAST)
    // Create particle array in device memory
    cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS);
    cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice);
#endif

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB);
    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

#ifndef OPTIMUS
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");
#endif

    if (ref_file) {
        autoTest(argv);
        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice. It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    } else {
#ifdef BROADCAST
        const char *sv_addr = "127.0.0:9097";
        const char *bc_addr = "127.255.255.2:9097";

        // Server address
        if (argc > 2)
            sv_addr = argv[2];

        // Broadcast address
        if (argc > 1)
            bc_addr = argv[1];

        server.reset(new UdpBroadcastServer(sv_addr, bc_addr));

        // Listen to clients' feedbacks in a separate thread.
        {
            pthread_t tid;
            pthread_create(&tid, NULL, &feedback_listener, &step);
        }

        // Broadcast the particles state in a separate thread.
        {
            pthread_t tid;
            pthread_create(&tid, NULL, &broadcaster, &step);
        }
#endif

#if defined (__APPLE__) || defined(MACOSX)
        atexit(cleanup);
#else
        glutCloseFunc(cleanup);
#endif
        glutMainLoop();
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (!ref_file) {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
cudaError_t WINAPI wine_cudaMallocPitch(void** devPtr, size_t* pitch,
                                        size_t widthInBytes, size_t height) {
    WINE_TRACE("\n");
    return cudaMallocPitch(devPtr, pitch, widthInBytes, height);
}
///////////////////////
// Main program entry
///////////////////////
int main(int argc, char** argv) {
    unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ;
    int rank, numberOfProcesses;

    if (argc == 8) {
        Nx = atoi(argv[1]);
        Ny = atoi(argv[2]);
        Nz = atoi(argv[3]);
        max_iters = atoi(argv[4]);
        blockX = atoi(argv[5]);
        blockY = atoi(argv[6]);
        blockZ = atoi(argv[7]);
    } else {
        printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
        exit(1);
    }

    InitializeMPI(&argc, &argv, &rank, &numberOfProcesses);
    AssignDevices(rank);
    ECCCheck(rank);

    // Define constants
    const _DOUBLE_ L = 1.0;
    const _DOUBLE_ h = L/(Nx+1);
    const _DOUBLE_ dt = h*h/6.0;
    const _DOUBLE_ beta = dt/(h*h);
    const _DOUBLE_ c0 = beta;
    const _DOUBLE_ c1 = (1-6*beta);

    // Copy constants to Constant Memory on the GPUs
    CopyToConstantMemory(c0, c1);

    // Decompose along the z-axis
    const int _Nz = Nz/numberOfProcesses;
    const int dt_size = sizeof(_DOUBLE_);

    // Host memory allocations
    _DOUBLE_ *u_new, *u_old;
    _DOUBLE_ *h_Uold;

    u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    if (rank == 0) {
        h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    }

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate host subdomains (pinned allocations)
    _DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses];
    _DOUBLE_ *left_send_buffer, *left_receive_buffer;
    _DOUBLE_ *right_send_buffer, *right_receive_buffer;

#if defined(DEBUG) || defined(_DEBUG)
    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        }
    }
#endif

    checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

    init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank);

    // GPU stream operations
    cudaStream_t compute_stream;
    cudaStream_t data_stream;

    checkCuda(cudaStreamCreate(&compute_stream));
    checkCuda(cudaStreamCreate(&data_stream));

    // GPU Memory Operations
    size_t pitch_bytes, pitch_gc_bytes;

    _DOUBLE_ *d_s_Unews, *d_s_Uolds;
    _DOUBLE_ *d_right_send_buffer, *d_left_send_buffer;
    _DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer;

    checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
    checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
    checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));

    // Copy subdomains from host to device and get walltime
    double HtD_timer = 0.;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    HtD_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
    checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    HtD_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    unsigned int ghost_width = 1;

    int pitch = pitch_bytes/dt_size;
    int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
    dim3 threads_per_block(blockX, blockY, blockZ);
    unsigned int blocksInX = getBlock(Nx, blockX);
    unsigned int blocksInY = getBlock(Ny, blockY);
    unsigned int blocksInZ = getBlock(_Nz-2, k_loop);

    dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
    dim3 thread_blocks_halo(blocksInX, blocksInY);

    //MPI_Status status;
    MPI_Status status[numberOfProcesses];
    MPI_Request gather_send_request[numberOfProcesses];
    MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses];
    MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses];

    double compute_timer = 0.;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    compute_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    for (unsigned int iterations = 0; iterations < max_iters; iterations++) {
        // Compute right boundary data on device 0
        if (rank == 0) {
            int kstart = (_Nz+1)-ghost_width;
            int kstop = _Nz+1;

            ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
            CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);

            checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
            checkCuda(cudaStreamSynchronize(data_stream));

            MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank]));
        } else {
            int kstart = 1;
            int kstop = 1+ghost_width;

            ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
            CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);

            checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
            checkCuda(cudaStreamSynchronize(data_stream));

            MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank]));
        }

        // Compute inner nodes for device 0
        if (rank == 0) {
            int kstart = 1;
            int kstop = (_Nz+1)-ghost_width;

            ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
        }
        // Compute inner nodes for device 1
        else {
            int kstart = 1+ghost_width;
            int kstop = _Nz+1;

            ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
        }

        // Receive data from device 1
        if (rank == 0) {
            MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank]));
        } else {
            MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank]));
        }

        if (rank == 0) {
            MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank]));

            checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
            CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
        } else {
            MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank]));

            checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
            CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
        }

        if (rank == 0) {
            MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE));
        } else {
            MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE));
        }

        // Swap pointers on the host
        checkCuda(cudaDeviceSynchronize());
        swap(_DOUBLE_*, d_s_Unews, d_s_Uolds);
    }

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    compute_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    // Copy data from device to host
    double DtH_timer = 0;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    DtH_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault));

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    DtH_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    // Gather results from subdomains
    MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank]));

    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank]));
            merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i);
        }
    }

    // Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
    if (rank == 0) {
        cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
    }
#endif

    if (rank == 0) {
        float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
        PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

        _DOUBLE_ t = max_iters * dt;
        CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);
    }

    Finalize();

    // Free device memory
    checkCuda(cudaFree(d_s_Unews));
    checkCuda(cudaFree(d_s_Uolds));
    checkCuda(cudaFree(d_right_send_buffer));
    checkCuda(cudaFree(d_left_send_buffer));
    checkCuda(cudaFree(d_right_receive_buffer));
    checkCuda(cudaFree(d_left_receive_buffer));

    // Free host memory
    checkCuda(cudaFreeHost(h_s_Unews));
    checkCuda(cudaFreeHost(h_s_Uolds));

#if defined(DEBUG) || defined(_DEBUG)
    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            checkCuda(cudaFreeHost(h_s_rbuf[i]));
        }
        free(h_Uold);
    }
#endif

    checkCuda(cudaFreeHost(left_send_buffer));
    checkCuda(cudaFreeHost(left_receive_buffer));
    checkCuda(cudaFreeHost(right_send_buffer));
    checkCuda(cudaFreeHost(right_receive_buffer));

    checkCuda(cudaDeviceReset());

    free(u_old);
    free(u_new);

    return 0;
}
float WFIRFilterCuda::cudaFilter(WLEMData::ScalarT* const output,
                                 const WLEMData::ScalarT* const input,
                                 const WLEMData::ScalarT* const previous,
                                 size_t channels, size_t samples,
                                 const WLEMData::ScalarT* const coeffs,
                                 size_t coeffSize) {
    CuScalarT *dev_in = NULL;
    size_t pitchIn;

    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;

    CuScalarT *dev_out = NULL;
    size_t pitchOut;

    CuScalarT *dev_co = NULL;

    try {
        CudaThrowsCall(cudaMallocPitch((void**)&dev_in, &pitchIn, samples * sizeof(CuScalarT), channels));
        CudaThrowsCall(cudaMemcpy2D(dev_in, pitchIn, input, samples * sizeof(CuScalarT),
                                    samples * sizeof(CuScalarT), channels, cudaMemcpyHostToDevice));

        CudaThrowsCall(cudaMallocPitch((void**)&dev_prev, &pitchPrev, coeffSize * sizeof(CuScalarT), channels));
        CudaThrowsCall(cudaMemcpy2D(dev_prev, pitchPrev, previous, coeffSize * sizeof(CuScalarT),
                                    coeffSize * sizeof(CuScalarT), channels, cudaMemcpyHostToDevice));

        CudaThrowsCall(cudaMallocPitch((void**)&dev_out, &pitchOut, samples * sizeof(CuScalarT), channels));

        CudaThrowsCall(cudaMalloc((void**)&dev_co, coeffSize * sizeof(CuScalarT)));
        CudaThrowsCall(cudaMemcpy(dev_co, coeffs, coeffSize * sizeof(CuScalarT), cudaMemcpyHostToDevice));
    } catch (const WException& e) {
        wlog::error(CLASS) << e.what();
        if (dev_in) {
            CudaSafeCall(cudaFree((void*)dev_in));
        }
        if (dev_prev) {
            CudaSafeCall(cudaFree((void*)dev_prev));
        }
        if (dev_out) {
            CudaSafeCall(cudaFree((void*)dev_out));
        }
        if (dev_co) {
            CudaSafeCall(cudaFree((void*)dev_co));
        }
        throw WLBadAllocException("Could not allocate CUDA memory!");
    }

    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = (samples + threadsPerBlock - 1) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof(CuScalarT);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    cuFirFilter(blocksPerGrid, threadsPerBlock, sharedMem,
                dev_out, dev_in, dev_prev, channels, samples,
                dev_co, coeffSize, pitchOut, pitchIn, pitchPrev);
    cudaError_t kernelError = cudaGetLastError();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    try {
        if (kernelError != cudaSuccess) {
            const std::string err(cudaGetErrorString(kernelError));
            throw WException("CUDA kernel failed: " + err);
        }
        CudaThrowsCall(cudaMemcpy2D(output, samples * sizeof(CuScalarT), dev_out, pitchOut,
                                    samples * sizeof(CuScalarT), channels, cudaMemcpyDeviceToHost));
    } catch (const WException& e) {
        wlog::error(CLASS) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall(cudaFree((void*)dev_in));
    CudaSafeCall(cudaFree((void*)dev_prev));
    CudaSafeCall(cudaFree((void*)dev_out));
    CudaSafeCall(cudaFree((void*)dev_co));

    if (elapsedTime > -1.0) {
        return elapsedTime;
    } else {
        throw WException("Error in cudaFilter()");
    }
}
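// Hedged sketch of how the three byte pitches passed to cuFirFilter above are
// typically consumed on the device: each channel's row starts channel*pitch
// bytes into its buffer. The helper name is illustrative, not from the
// original kernel.
__device__ inline const CuScalarT* channelRow(const CuScalarT* base,
                                              size_t pitchBytes, size_t channel) {
    return (const CuScalarT*)((const char*)base + channel * pitchBytes);
}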
FrameSource::FrameStatus GStreamerBaseFrameSourceImpl::fetch(vx_image image, vx_uint32 /*timeout*/) {
    if (end) {
        close();
        return FrameSource::CLOSED;
    }

    handleGStreamerMessages();

    if (gst_app_sink_is_eos(GST_APP_SINK(sink))) {
        close();
        return FrameSource::CLOSED;
    }

    if ((lastFrameTimestamp.toc()/1000.0) > Application::get().getSourceDefaultTimeout()) {
        close();
        return FrameSource::CLOSED;
    }

    lastFrameTimestamp.tic();

#if GST_VERSION_MAJOR == 0
    std::unique_ptr<GstBuffer, GStreamerObjectDeleter> bufferHolder(
        gst_app_sink_pull_buffer(GST_APP_SINK(sink)));
    GstBuffer* buffer = bufferHolder.get();
#else
    std::unique_ptr<GstSample, GStreamerObjectDeleter> sample(gst_app_sink_pull_sample(GST_APP_SINK(sink)));

    if (!sample) {
        close();
        return FrameSource::CLOSED;
    }

    GstBuffer* buffer = gst_sample_get_buffer(sample.get());
#endif

    gint width;
    gint height;

#if GST_VERSION_MAJOR == 0
    std::unique_ptr<GstCaps, GStreamerObjectDeleter> bufferCapsHolder(gst_buffer_get_caps(buffer));
    GstCaps* bufferCaps = bufferCapsHolder.get();
#else
    GstCaps* bufferCaps = gst_sample_get_caps(sample.get());
#endif

    // bail out if no caps
    assert(gst_caps_get_size(bufferCaps) == 1);
    GstStructure* structure = gst_caps_get_structure(bufferCaps, 0);

    // bail out if width or height are 0
    if (!gst_structure_get_int(structure, "width", &width) ||
        !gst_structure_get_int(structure, "height", &height)) {
        close();
        return FrameSource::CLOSED;
    }

    int depth = 3;
#if GST_VERSION_MAJOR > 0
    depth = 0;
    const gchar* name = gst_structure_get_name(structure);
    const gchar* format = gst_structure_get_string(structure, "format");

    if (!name || !format) {
        close();
        return FrameSource::CLOSED;
    }

    // we support 2 types of data:
    //     video/x-raw, format=RGB   -> 8bit, 3 channels
    //     video/x-raw, format=GRAY8 -> 8bit, 1 channel
    if (strcasecmp(name, "video/x-raw") == 0) {
        if (strcasecmp(format, "RGB") == 0) {
            depth = 3;
        } else if (strcasecmp(format, "GRAY8") == 0) {
            depth = 1;
        }
    }
#endif

    if (depth == 0) {
        close();
        return FrameSource::CLOSED;
    }

    vx_imagepatch_addressing_t decodedImageAddr;
    decodedImageAddr.dim_x = width;
    decodedImageAddr.dim_y = height;
    decodedImageAddr.stride_x = depth;
    // GStreamer uses a stride of width rounded up to the nearest multiple of 4
    decodedImageAddr.stride_y = ((width*depth+3)/4)*4;
    decodedImageAddr.scale_x = 1;
    decodedImageAddr.scale_y = 1;

    vx_image decodedImage = NULL;
    vx_df_image_e vx_type_map[5] = { VX_DF_IMAGE_VIRT, VX_DF_IMAGE_U8,
                                     VX_DF_IMAGE_VIRT, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX };

    // fetch image width and height
    vx_uint32 actual_width, actual_height;
    vx_df_image_e actual_format;
    NVXIO_SAFE_CALL( vxQueryImage(image, VX_IMAGE_ATTRIBUTE_WIDTH, (void *)&actual_width, sizeof(actual_width)) );
    NVXIO_SAFE_CALL( vxQueryImage(image, VX_IMAGE_ATTRIBUTE_HEIGHT, (void *)&actual_height, sizeof(actual_height)) );
    NVXIO_SAFE_CALL( vxQueryImage(image, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&actual_format, sizeof(actual_format)) );
    bool needScale = width != (int)configuration.frameWidth ||
                     height != (int)configuration.frameHeight;

    // configured and actual image sizes must be the same!
    if ((actual_height != configuration.frameHeight) ||
        (actual_width != configuration.frameWidth) ||
        (actual_format != configuration.format)) {
        close();
        NVXIO_THROW_EXCEPTION("Actual image [ " << actual_width << " x " << actual_height <<
                              " ] does not equal configuration one [ " << configuration.frameWidth <<
                              " x " << configuration.frameHeight << " ]");
    }

    // we assume that the decoded image will have no more than 3 channels per pixel
    if (!devMem) {
        NVXIO_ASSERT( cudaSuccess == cudaMallocPitch(&devMem, &devMemPitch, width * 3, height) );
    }

    // check if decoded image format has changed
    if (scaledImage) {
        vx_df_image_e scaled_format;
        NVXIO_SAFE_CALL( vxQueryImage(scaledImage, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&scaled_format, sizeof(scaled_format)) );

        if (scaled_format != vx_type_map[depth]) {
            vxReleaseImage(&scaledImage);
            scaledImage = NULL;
        }
    }

    if (needScale && !scaledImage) {
        scaledImage = vxCreateImage(vxContext, configuration.frameWidth, configuration.frameHeight, vx_type_map[depth]);
        NVXIO_CHECK_REFERENCE( scaledImage );
    }

#if GST_VERSION_MAJOR == 0
    bool needConvert = configuration.format != VX_DF_IMAGE_RGB;
    void * decodedPtr = GST_BUFFER_DATA(buffer);
#else
    GstMapInfo info;

    gboolean success = gst_buffer_map(buffer, &info, (GstMapFlags)GST_MAP_READ);
    if (!success) {
        printf("GStreamer: unable to map buffer\n");
        close();
        return FrameSource::CLOSED;
    }

    bool needConvert = configuration.format != vx_type_map[depth];
    void * decodedPtr = info.data;
#endif

    if (!needConvert && !needScale) {
        decodedImage = vxCreateImageFromHandle(vxContext, vx_type_map[depth], &decodedImageAddr,
                                               &decodedPtr, VX_IMPORT_TYPE_HOST);
        NVXIO_CHECK_REFERENCE( decodedImage );
        NVXIO_SAFE_CALL( nvxuCopyImage(vxContext, decodedImage, image) );
    } else {
        // 1. upload decoded image to CUDA buffer
        NVXIO_ASSERT( cudaSuccess == cudaMemcpy2D(devMem, devMemPitch,
                                                  decodedPtr, decodedImageAddr.stride_y,
                                                  decodedImageAddr.dim_x * depth,
                                                  decodedImageAddr.dim_y, cudaMemcpyHostToDevice) );

        // 2. create vx_image wrapper for decoded buffer
        decodedImageAddr.stride_y = static_cast<vx_int32>(devMemPitch);
        decodedImage = vxCreateImageFromHandle(vxContext, vx_type_map[depth], &decodedImageAddr,
                                               &devMem, NVX_IMPORT_TYPE_CUDA);
        NVXIO_CHECK_REFERENCE( decodedImage );

        if (needScale) {
            // 3. scale image
            NVXIO_SAFE_CALL( vxuScaleImage(vxContext, decodedImage, scaledImage, VX_INTERPOLATION_TYPE_BILINEAR) );
            // 4. convert to dst image
            NVXIO_SAFE_CALL( vxuColorConvert(vxContext, scaledImage, image) );
        } else {
            // 3. convert to dst image
            NVXIO_SAFE_CALL( vxuColorConvert(vxContext, decodedImage, image) );
        }
    }

#if GST_VERSION_MAJOR != 0
    gst_buffer_unmap(buffer, &info);
#endif

    NVXIO_SAFE_CALL( vxReleaseImage(&decodedImage) );

    return FrameSource::OK;
}
void InitCudaLayers() {
    mmGridSizeX = sim_width/blockSizex;
    mmGridSizeY = sim_height/blockSizey;
    mmGridSize = mmGridSizeX*mmGridSizeY;

    memset(mmGrid, 0, sizeof(mmGrid));
    memset(mmYGGrid, 0, sizeof(mmYGGrid));

    tempHostData = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float));
    tempHostDataNoCuda = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float));
    grid8ValTick = (float*)malloc(sim_width*sim_height*8*sizeof(float));

    initColors();

    memset(gCudaLayer, 0, sizeof(gCudaLayer));
    memset(gCudaFuncLayer, 0, sizeof(gCudaFuncLayer));
    memset(gPhysLayer, 0, sizeof(gPhysLayer));
    memset(gStateLayer, 0, sizeof(gStateLayer));

    srand(0);
    int seed = rand();

    const cudaChannelFormatDesc desc4 = cudaCreateChannelDesc<float4>();
    cudaMallocArray(&gCudaVectArray, &desc4, sim_width, sim_height);

#if NFLAYERS == 2
    const cudaChannelFormatDesc descF = cudaCreateChannelDesc<float2>();
#elif NFLAYERS == 4
    const cudaChannelFormatDesc descF = desc4;
#endif
    cudaMallocArray(&gCudaFlArray, &descF, sim_width, sim_height);

    const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaMallocArray(&gCudaFuncWavePack, &desc, sim_width);
    cudaMallocArray(&gCudaFuncSmooth, &desc, sim_width);
    cudaMallocArray(&(gCudaLayer[0]), &desc, sim_width, sim_height);
    cudaMallocArray(&(gCudaLayer[1]), &desc, sim_width, sim_height);
    cudaMallocArray(&(gCudaFuncLayer[0]), &desc, sim_width, sim_height);

    cudaMalloc(&cuTempData, TEMP_SIZE*sizeof(float)*sim_width*sim_height);
    cudaMalloc(&cuRandArr, sizeof(unsigned int)*sim_width*sim_height);

    cudaMalloc(&gStateLayer[0], sim_rect*sizeof(float));
    cudaMemset(gStateLayer[0], 0, sim_rect*sizeof(float));
    cudaMalloc(&gStateLayer[1], sim_rect*sizeof(float));
    cudaMemset(gStateLayer[1], 0, sim_rect*sizeof(float));

    cudaMalloc(&gPhysLayer[0], sim_rect*sizeof(float));
    cudaMemset(gPhysLayer[0], 0, sim_rect*sizeof(float));
    cudaMalloc(&gPhysLayer[1], sim_rect*sizeof(float));
    cudaMemset(gPhysLayer[1], 0, sim_rect*sizeof(float));

    cudaMalloc(&gRedBlueField, NFLAYERS*sim_rect*sizeof(float));
    cudaMemset(gRedBlueField, 0, NFLAYERS*sim_rect*sizeof(float));

    // pitch is set by cudaMallocPitch; use the returned value when clearing
    size_t pitch;
    cudaMallocPitch((void**)&gVectorLayer, &pitch, 4*sim_width*sizeof(float), sim_height);
    cudaMemset2D(gVectorLayer, pitch, 0, 4*sim_width*sizeof(float), sim_height);

    InitWavePack(32, 1.f, sim_width, sim_height, cuTempData, gCudaFuncWavePack);
    InitSmooth(1, sim_width, cuTempData, gCudaFuncSmooth);
    InitRnd2DInt(seed, cuRandArr, sim_width, sim_height);
    InitFuncLayer(gCudaFuncLayer[0], cuTempData, sim_width, sim_height);
    InitPhysLayer(gPhysLayer[0], gStateLayer[0], cuRandArr, sim_width, sim_height);

    float* gridIni = cuTempData + 3*sim_rect/2;
    float* halfTemp = cuTempData + sim_rect;
    float* out = cuTempData + 2*sim_rect;
    cudaMemset(out, 0, sim_rect*sizeof(float));

    seed = rand();
    int gridx = INTERP_SIZEX;
    int gridy = INTERP_SIZEX;
    InitRnd2DF(seed, gridIni, gridx, gridy);
    float scaleadd = .7f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

    seed = rand();
    gridx = (int)(gridx*2);
    gridy = (int)(gridy*2);
    InitRnd2DF(seed, gridIni, gridx, gridy);
    scaleadd = .3f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

    cudaMemcpyToArray(gCudaLayer[0], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice);

    cudaMemset(out, 0, sim_rect*sizeof(float));
    gridx = INTERP_SIZEX;
    gridy = INTERP_SIZEX;
    seed = rand();
    InitRnd2DF(seed, gridIni, gridx, gridy);
    scaleadd = .7f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

    seed = rand();
    gridx = (int)(gridx*1.5);
    gridy = (int)(gridy*1.5);
    InitRnd2DF(seed, gridIni, gridx, gridy);
    scaleadd = .3f;
    Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

    cudaMemcpyToArray(gCudaLayer[1], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice);

    float2 pos0;
    pos0.x = gObj0X;
    pos0.y = gObj0Y;
    float2 pos1;
    pos1.x = gObj1X;
    pos1.y = gObj1Y;

    gObjInertia.Init(pos0, pos1);

    LayerProc(sim_width, sim_height, gCudaLayer[0], gCudaFuncLayer[0], cuTempData,
              pos0.x, pos0.y, pos1.x, pos1.y);
    ParticleStateInit(cuTempData, cuRandArr, gStateLayer[0], gPhysLayer[0], gRedBlueField);

    InitBhv();
}
int main(int argc, char **argv) {
    int devID;
    cudaDeviceProp deviceProps;

    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv)) {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    devID = findCudaGLDevice(argc, (const char **)argv);

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);
    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB);
    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");

    if (ref_file) {
        autoTest(argv);
        cleanup();
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    } else {
        atexit(cleanup);
        glutMainLoop();
    }

    cudaDeviceReset();

    if (!ref_file) {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
void convertFrame(vx_context vxContext,
                  vx_image frame,
                  const FrameSource::Parameters & configuration,
                  vx_imagepatch_addressing_t & decodedImageAddr,
                  void * decodedPtr,
                  bool is_cuda,
                  void *& devMem,
                  size_t & devMemPitch,
                  vx_image & scaledImage) {
    vx_df_image_e vx_type_map[5] = { VX_DF_IMAGE_VIRT, VX_DF_IMAGE_U8,
                                     VX_DF_IMAGE_VIRT, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX };
    vx_df_image_e decodedFormat = vx_type_map[decodedImageAddr.stride_x];

    // fetch image width and height
    vx_uint32 frameWidth, frameHeight;
    vx_df_image_e frameFormat;
    NVXIO_SAFE_CALL( vxQueryImage(frame, VX_IMAGE_ATTRIBUTE_WIDTH, (void *)&frameWidth, sizeof(frameWidth)) );
    NVXIO_SAFE_CALL( vxQueryImage(frame, VX_IMAGE_ATTRIBUTE_HEIGHT, (void *)&frameHeight, sizeof(frameHeight)) );
    NVXIO_SAFE_CALL( vxQueryImage(frame, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&frameFormat, sizeof(frameFormat)) );
    bool needScale = frameWidth != decodedImageAddr.dim_x ||
                     frameHeight != decodedImageAddr.dim_y;
    bool needConvert = frameFormat != decodedFormat;

    // configured and actual image sizes must be the same!
    if ((frameWidth != configuration.frameWidth) ||
        (frameHeight != configuration.frameHeight)) {
        NVXIO_THROW_EXCEPTION("Actual image [ " << frameWidth << " x " << frameHeight <<
                              " ] is not equal to configuration one [ " << configuration.frameWidth <<
                              " x " << configuration.frameHeight << " ]");
    }

    // allocate CUDA memory to copy decoded image to
    if (!is_cuda) {
        if (!devMem) {
            // we assume that decoded image will have no more than 4 channels per pixel
            NVXIO_ASSERT( cudaSuccess == cudaMallocPitch(&devMem, &devMemPitch,
                                                         decodedImageAddr.dim_x * 4,
                                                         decodedImageAddr.dim_y) );
        }
    }

    // check if decoded image format has changed
    if (scaledImage) {
        vx_df_image_e scaledFormat;
        NVXIO_SAFE_CALL( vxQueryImage(scaledImage, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&scaledFormat, sizeof(scaledFormat)) );

        if (scaledFormat != decodedFormat) {
            NVXIO_SAFE_CALL( vxReleaseImage(&scaledImage) );
            scaledImage = NULL;
        }
    }

    if (needScale && !scaledImage) {
        scaledImage = vxCreateImage(vxContext, frameWidth, frameHeight, decodedFormat);
        NVXIO_CHECK_REFERENCE( scaledImage );
    }

    vx_image decodedImage = NULL;

    // 1. create vx_image wrapper
    if (is_cuda) {
        // a. create vx_image wrapper from CUDA pointer
        decodedImage = vxCreateImageFromHandle(vxContext, decodedFormat, &decodedImageAddr,
                                               &decodedPtr, NVX_IMPORT_TYPE_CUDA);
    } else {
        // a. upload decoded image to CUDA buffer
        NVXIO_ASSERT( cudaSuccess == cudaMemcpy2D(devMem, devMemPitch,
                                                  decodedPtr, decodedImageAddr.stride_y,
                                                  decodedImageAddr.dim_x * decodedImageAddr.stride_x,
                                                  decodedImageAddr.dim_y, cudaMemcpyHostToDevice) );

        // b. create vx_image wrapper for decoded buffer
        decodedImageAddr.stride_y = static_cast<vx_int32>(devMemPitch);
        decodedImage = vxCreateImageFromHandle(vxContext, decodedFormat, &decodedImageAddr,
                                               &devMem, NVX_IMPORT_TYPE_CUDA);
    }

    NVXIO_CHECK_REFERENCE( decodedImage );

    // 2. scale if necessary
    if (needScale) {
        // a. scale image
        NVXIO_SAFE_CALL( vxuScaleImage(vxContext, decodedImage, scaledImage, VX_INTERPOLATION_TYPE_BILINEAR) );
    } else {
        scaledImage = decodedImage;
    }

    // 3. convert / copy to dst image
    if (needConvert) {
        NVXIO_SAFE_CALL( vxuColorConvert(vxContext, scaledImage, frame) );
    } else {
        NVXIO_SAFE_CALL( nvxuCopyImage(vxContext, scaledImage, frame) );
    }

    if (!needScale)
        scaledImage = NULL;

    NVXIO_SAFE_CALL( vxReleaseImage(&decodedImage) );
}